def survival(self):
    """Fit a Kaplan-Meier model to the patient records and report it.

    Records with ``value[2] == 1`` (infected at baseline, per the original
    comment) are left out of the analysis.  For every other record
    ``value[4]`` is used as the duration and ``value[3]`` as the event flag.

    ``self.KM`` selects the output: ``"table"`` prints the survival
    function, ``"plot"`` draws and shows the curve, ``"median"`` prints the
    median.  Any other value fits the model and reports nothing.
    """
    # T = length of time before the 'event' (either time to infection or
    # time to discharge); E indicates whether the infection 'event' took place.
    T = []
    E = []
    # Only the records are needed; values() exists on both Python 2 and
    # Python 3 mappings, unlike iteritems().
    for value in self.patients.values():
        if value[2] == 1:
            continue
        T.append(value[4])
        E.append(value[3])

    # Kaplan-Meier estimator from the lifelines package.
    kmf = KaplanMeierFitter()
    kmf.fit(numpy.array(T), event_observed=numpy.array(E))

    # print(x) with one argument behaves identically as a Python 2 print
    # statement and as a Python 3 function call.
    if self.KM == "table":
        print(kmf.survival_function_)
    elif self.KM == "plot":
        kmf.plot()
        plt.show()
    elif self.KM == "median":
        print(kmf.median_)
def plot_km_estimates(self, index):
    """Plot Kaplan-Meier curves for the sub-group and its complement.

    Both curves are drawn on figure number ``index + 1`` and the figure is
    saved under ``<string_repr[0]>_model``.
    """
    rcParams['figure.figsize'] = 15, 6
    plt.figure(index + 1)
    axes = plt.subplot(111)

    cohorts = (
        (self.sub_group, 'KM estimates for subgroup'),
        (self.sub_group_complement, 'KM estimates for complement'),
    )
    fitters = [KaplanMeierFitter() for _ in cohorts]
    for fitter, (cohort, legend_label) in zip(fitters, cohorts):
        fitter.fit(cohort['survival_times'], cohort['events'],
                   label=legend_label, alpha=UserInputs.kmf_alpha)
        fitter.plot(ax=axes)

    name, description = self.string_repr[0], self.string_repr[1]
    plt.title(name + ': ' + description)
    plt.xlabel('Time')
    plt.ylabel('Survival probability')
    plt.savefig(name + '_model')
    return
def survival_analysis(dataframe, grouping, years = 5):
    """Plot one Kaplan-Meier curve per value of ``grouping``.

    Rows with a null ``grouping``, ``_OS`` or ``_EVENT`` value are dropped.
    Follow-up is truncated at ``years * 365``: rows whose ``_OS`` exceeds
    that horizon get the horizon as duration and an event flag of 0.

    Args:
        dataframe: frame holding ``_OS``, ``_EVENT`` and the grouping column.
            It is not modified.
        grouping: name of the column to split the curves by.
        years: length of the analysis window (multiplied by 365).
    """
    # remove patients with null values
    df2 = dataframe.dropna(subset=[grouping, '_OS', '_EVENT']).copy()

    # Limit the analysis to the requested number of years.  The columns are
    # built in one step each: chained assignment such as
    # ``df2['survival'][mask] = ...`` may write to a temporary copy and be
    # silently lost.
    maxtime = years * 365
    beyond_horizon = df2['_OS'] > maxtime
    df2['survival'] = df2['_OS'].where(~beyond_horizon, maxtime).astype(float)
    df2['event'] = df2['_EVENT'].where(~beyond_horizon, 0).astype(float)

    # get groups
    grouped_data = df2.groupby(grouping)
    unique_groups = sorted(grouped_data.groups.keys())

    # plot survival curves, all on the same axes
    kmf = KaplanMeierFitter()
    ax = plt.subplot(111)
    for group in unique_groups:
        data = grouped_data.get_group(group)
        kmf.fit(data['survival'], data['event'], label = group)
        kmf.plot(ax=ax, show_censors = True)
    plt.show()
def plot_kaplan_meier(self, column, value):
    """Plot Kaplan-Meier survival curves for two slices of ``self.data``.

    One curve covers the rows where ``column == value``, the other the rows
    where ``column != value``.  Durations come from
    ``overall_survival_months`` and events from ``death_from_cancer``.

    Args:
        column (string): column of ``self.data`` to split on, e.g. a
            receptor-status column.
        value (string or integer): value of ``column`` that defines the
            first group; also used as its legend label.
    """
    matching = self.data[self.data[column] == value]
    remaining = self.data[self.data[column] != value]

    fitter_match = KaplanMeierFitter()
    fitter_match.fit(matching.overall_survival_months,
                     event_observed=matching['death_from_cancer'],
                     label=value)
    ax = fitter_match.plot()

    fitter_rest = KaplanMeierFitter()
    fitter_rest.fit(remaining.overall_survival_months,
                    event_observed=remaining['death_from_cancer'],
                    label=f'not {value}')
    ax = fitter_rest.plot(ax=ax)

    add_at_risk_counts(fitter_match, fitter_rest, ax=ax)
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('Timeline (Months)')
    ax.set_title(f'Kaplan Meier plot in months of {column} variable')
    plt.tight_layout()
    plt.show()
def __KM_analysis(self,duration_table,expressed_array,unexpressed_array,freq_set):
    """Log-rank test between two sample groups; plot them when significant.

    ``duration_table`` rows are read as ``(sample id, duration, event)``.
    The first row is skipped and rows holding ``"NA"`` in either value are
    ignored.  A sample listed in ``unexpressed_array`` takes precedence over
    one listed in ``expressed_array``.  Kaplan-Meier curves are shown only
    when the log-rank p-value is below 0.0006.

    Returns:
        The log-rank p-value.
    """
    expressed_T, expressed_C = [], []
    unexpressed_T, unexpressed_C = [], []

    for idx, row in enumerate(duration_table):
        if not idx > 0:
            # first row is skipped (presumably a header -- confirm)
            continue
        sample_id = row[0]
        if sample_id in unexpressed_array:
            durations, events = unexpressed_T, unexpressed_C
        elif sample_id in expressed_array:
            durations, events = expressed_T, expressed_C
        else:
            continue
        if row[1] == "NA" or row[2] == "NA":
            continue
        durations.append(float(row[1]))
        events.append(int(row[2]))

    results = logrank_test(expressed_T, unexpressed_T, expressed_C, unexpressed_C, alpha=.95)

    if results.p_value < .0006:
        ax = plt.subplot(111)
        kmf = KaplanMeierFitter()
        for durations, events, legend in (
            (expressed_T, expressed_C, "Satisfying"),
            (unexpressed_T, unexpressed_C, "None-Satisfying"),
        ):
            kmf.fit(durations, event_observed=events, label=legend)
            kmf.plot(ax=ax, ci_force_lines=False)
        plt.ylim(0, 1)
        plt.title("Lifespans (" + str(freq_set) + ")")
        plt.show()

    return results.p_value
def single_submit(form):
    """Handle the single-gene form: plot low vs. high groups and render the page.

    ``form.Low`` / ``form.High`` are percentages of the sample count; each
    group contains at least one row.  The plot is written to
    ``static/test.png``.  An invalid form re-renders the page with the
    form's errors.
    """
    if not form.validate_on_submit():
        return render_template("single.html", form=form, err=form.errors)

    database = form.DataBase.data
    Gene = form.GeneName.data
    low_pct = int(form.Low.data)
    high_pct = int(form.High.data)

    static = {}
    data, os_values, static['mean'], static['std'] = ReadData(database, Gene)

    num = len(os_values)
    n_low = max(int(num * low_pct / 100), 1)
    n_high = max(int(num * high_pct / 100), 1)

    # Column 1 is passed to fit() as durations, column 2 as event flags.
    durations, events = data[:, 1], data[:, 2]

    kmf = KaplanMeierFitter()
    kmf.fit(durations[0:n_low], events[0:n_low], label=Gene + '/low')
    ax = kmf.plot()
    kmf.fit(durations[-n_high:], events[-n_high:], label=Gene + '/high')
    kmf.plot(ax=ax)

    plt.savefig("static/test.png", bbox_inches='tight')
    plt.close()
    # `refresh` gets a random value on every request (presumably a
    # cache-buster for the image -- confirm against the template).
    return render_template("single.html", form=form, image="test.png",
                           refresh=np.random.randn(), static=static)
def plotKM(genes):
    """Save one Kaplan-Meier plot (high vs. low expression) per gene index 0-13.

    The survival table is read from ``data/survival_complete.txt`` after
    ``extractSurvivalData()`` has run.  Each figure is written to
    ``images/kmplot_<genes[i]>``.
    """
    extractSurvivalData()
    raw = np.genfromtxt("data/survival_complete.txt", delimiter='\t', dtype=str)
    column_names = ['id', 'ER', 'PR', 'HER2', 'TN', 'GCH1', 'CDH1', 'CDH2',
                    'VIM', 'bCatenin', 'ZEB1', 'ZEB2', 'TWIST1', 'SNAI1',
                    'SNAI2', 'RET', 'NGFR', 'EGFR', 'AXL', 'STATUS', 'MONTHS']
    df = pd.DataFrame(raw, columns=column_names)

    fitter = KaplanMeierFitter()
    for i in range(0, 14):
        # Per the original notes: split the complete data set into type
        # positive / type negative (here ER+ and ER-); only the positive
        # part is used below.
        er_positive, _er_negative = separateLabels(df, 'ER', i, 1)

        # Within the positive type, divide into high / low gene expression.
        high, low = separateHighandLow(df, genes, i, er_positive.values)

        # KM plot: column 2 of each split is used as the durations.
        fitter.fit(high[:, 2:3].astype(float), label='pos_high')
        ax = fitter.plot()
        fitter.fit(low[:, 2:3].astype(float), label='pos_low')
        fitter.plot(ax=ax)

        plt.savefig("images/kmplot_" + genes[i])
        plt.clf()
def kaplan_meier_curve(
    data_df: Union[pd.DataFrame, str],
    task: str = "liver",
    threshold: Union[float, List] = 0.5,
    process_dir: str = None,
):
    """Plot Kaplan-Meier curves for score bands of ``data_df[task]``.

    The bands are ``(-1, t1], (t1, t2], ..., (tn, 1]`` where ``t1..tn`` are
    the supplied thresholds.  Durations are read from the ``duration``
    column and events from the ``event`` column.

    Args:
        data_df: the data, or a path to a CSV file to load it from.
        task: name of the score column to band.
        threshold: one cut point or a list/tuple of cut points.  The
            argument is never modified.
        process_dir: when given, the figure is saved there as
            ``<task>_kaplan_meier.pdf``.
    """
    if isinstance(data_df, str):
        data_df = pd.read_csv(data_df)

    # Build a fresh list so the caller's list is not extended on every
    # call, and accept any scalar (int as well as float) as a single cut.
    if isinstance(threshold, (list, tuple)):
        thresholds = list(threshold) + [1]
    else:
        thresholds = [threshold, 1]

    ax = plt.subplot(111)
    kmf = KaplanMeierFitter()
    prev_threshold = -1
    for upper in thresholds:
        name = f"{task}: {prev_threshold} < y <= {upper}"
        in_band = (data_df[task] > prev_threshold) & (data_df[task] <= upper)
        grouped_df = data_df[in_band]
        kmf.fit(grouped_df["duration"], grouped_df["event"], label=name)
        kmf.plot(ax=ax)
        prev_threshold = upper

    plt.xlabel("Follow-up time (days)")
    plt.ylabel("Probability of survival")
    if process_dir is not None:
        plt.tight_layout()
        plt.savefig(os.path.join(process_dir, f"{task}_kaplan_meier.pdf"))
def graph(months, survival_status, has_mutation, name):
    """Show Kaplan-Meier curves for samples with and without a mutation.

    Args:
        months: survival durations.
        survival_status: 0 if living, 1 if dead.
        has_mutation: per-sample flag, 0/False without the mutation and
            1/True with it.  It is applied positionally, so it must be in
            the same order as ``months``.
        name: prefix for the plot title.
    """
    survival_data = pd.DataFrame({
        'OS_MONTHS': months,
        'OS_STATUS': survival_status  # 0 if living, 1 if dead
    })

    # Coerce to a positional boolean mask.  Indexing with raw 0/1 integers
    # would select rows by label, and ``~`` on integers yields -1/-2 rather
    # than a logical NOT.
    mask = pd.Series(has_mutation).astype(bool).values

    kmf = KaplanMeierFitter()

    # fit the data into a model for each group, sharing one axes
    kmf.fit(survival_data.OS_MONTHS[mask], survival_data.OS_STATUS[mask],
            label="have mutation")
    ax = kmf.plot(ci_show=True)

    kmf.fit(survival_data.OS_MONTHS[~mask], survival_data.OS_STATUS[~mask],
            label="no mutation")
    kmf.plot(ax=ax, ci_show=True)

    plt.title('{} survival plot'.format(name))
    plt.show()
def surv_curve_wg(value):
    """Show one Kaplan-Meier subplot per failure type of the unit ``value``.

    The data set is built by ``transurv`` from the two CSV files under
    ``../Survival-analysis/datasets`` (relative to the working directory)
    and filtered to rows whose ``name`` equals ``value``.  Durations come
    from ``runhour_cum`` and events from ``event``.
    """
    from scripts.transform_dataset import transurv

    datasets_dir = join(os.path.dirname(os.getcwd()), 'Survival-analysis', 'datasets')
    history = join(datasets_dir, 'filling_event.csv')
    machine = join(datasets_dir, 'machine.csv')

    ttf = transurv(hist_url=history, mach_url=machine)
    ttf_ad = ttf[ttf['name'] == value]
    T = ttf_ad['runhour_cum']
    E = ttf_ad['event']

    # Drop missing failure types: ``== NaN`` never matches, so such a
    # subplot would have nothing to fit.
    fail_names = ttf_ad['fail_type'].dropna().unique()

    # Keep the original 5x3 grid, growing it only when there are more than
    # 15 failure types.
    n_cols = 3
    n_rows = max(5, -(-len(fail_names) // n_cols))

    kmf = KaplanMeierFitter()
    fig = plt.figure(figsize=(15, 20))
    # enumerate from 1 gives every failure type a subplot slot; pairing with
    # range(1, n) stopped one short and dropped the last type.
    for num, c in enumerate(fail_names, start=1):
        ix = ttf_ad['fail_type'] == c
        ax = fig.add_subplot(n_rows, n_cols, num)
        kmf.fit(T[ix], E[ix], label=c)
        kmf.plot(ax=ax, legend=False)
        ax.set_title(c)
        ax.set_xlabel('runhour')
        # reference line at 50% survival
        ax.axhline(y=0.5, color='r', linestyle='dashed')

    plt.tight_layout()
    plt.show()
def KM_estimator(relapsed_data, censored_data):
    """Plot a single Kaplan-Meier curve, save it as ``km.pdf`` and show it.

    ``relapsed_data`` durations are marked as observed events (1) and
    ``censored_data`` durations as censored (0).  The two inputs are joined
    with ``+``, so they are expected to be lists.
    """
    all_durations = relapsed_data + censored_data
    observed_flags = list(np.ones(len(relapsed_data))) + list(np.zeros(len(censored_data)))

    ax = plt.subplot(111)
    fitter = KaplanMeierFitter()
    fitter.fit(all_durations, observed_flags, label='kaplan-meier curve')

    current_axes = plt.gca()
    current_axes.set_ylim([0, 1])
    current_axes.set_xlim([0, 86])
    current_axes.set_position([0.16, 0.175, 0.81, 0.8])

    fitter.plot(show_censors=False,
                censor_styles={'ms': 3, 'marker': 's'},
                ci_show=True,
                at_risk_counts=False)

    plt.xlabel('Time in Months', labelpad=10, fontsize=20)
    plt.ylabel('Survival Probability', labelpad=10, fontsize=20)

    # Same tick-label size on both axes.
    for axis in (ax.xaxis, ax.yaxis):
        for tick in axis.get_major_ticks():
            tick.label1.set_fontsize(15)

    plt.savefig('km.pdf')
    plt.show()
def survival_plot_and_cox(self, df_arr, label=(), filename=''):
    """Overlay one Kaplan-Meier curve per data frame and save the figure.

    Durations are read from ``bcrmonth`` and events from ``bcrstatus``.
    The figure is written to ``<filename>.png`` and ``<filename>.pdf``.
    Despite the name, no Cox model is fitted in this method.

    Args:
        df_arr: sequence of data frames, one per curve.
        label: legend labels, matched to ``df_arr`` by position.  Curves
            without a label use the fitter's default label.
        filename: output path without extension.

    Raises:
        ValueError: if ``df_arr`` is empty (there would be nothing to save).
    """
    if len(df_arr) == 0:
        raise ValueError("df_arr must contain at least one data frame")

    plt.clf()
    colors = ['red', 'green', 'blue', 'cyan', 'orange', 'black']
    kmf = KaplanMeierFitter()

    ax = None
    for idx, df_el in enumerate(df_arr):
        curve_label = label[idx] if idx < len(label) else None
        kmf.fit(df_el['bcrmonth'], df_el['bcrstatus'], label=curve_label)

        # Colours repeat after the sixth curve instead of raising IndexError.
        plot_kwargs = dict(show_censors=True, ci_show=False,
                           color=colors[idx % len(colors)], ylim=(0, 1))
        # The first call lets lifelines choose the axes, exactly as before;
        # later curves are drawn onto the axes it returned.
        if ax is not None:
            plot_kwargs['ax'] = ax
        ax = kmf.plot(**plot_kwargs)

    fig = ax.get_figure()
    fig.savefig(filename + '.png')
    fig.savefig(filename + '.pdf', format='PDF')
def test_kmf_add_at_risk_counts_with_single_row_multi_groups(
        self, block, kmf):
    """Visual check: a single "At risk" row under two overlaid KM curves."""
    durations = np.random.exponential(10, size=(100))
    observed = np.random.binomial(1, 0.8, size=(100))
    kmf_test = KaplanMeierFitter().fit(durations, observed, label="test")

    durations = np.random.exponential(15, size=(1000))
    observed = np.random.binomial(1, 0.6, size=(1000))
    kmf_con = KaplanMeierFitter().fit(durations, observed, label="con")

    figure = self.plt.figure()
    ax = figure.subplots(1, 1)
    for fitter in (kmf_test, kmf_con):
        fitter.plot(ax=ax)

    ax.set_ylim([0.0, 1.1])
    ax.set_xlim([0.0, 100])
    ax.set_xlabel("Days")
    ax.set_ylabel("Survival probability")

    add_at_risk_counts(kmf_test, kmf_con, ax=ax,
                       rows_to_show=["At risk"], ypos=-0.4)
    self.plt.title(
        "test_kmf_add_at_risk_counts_with_single_row_multi_groups")
    self.plt.tight_layout()
    self.plt.show(block=block)
def plot_survival_curves(rec_t, rec_e, antirec_t, antirec_e, experiment_name = '', output_file = None):
    """Plot two Kaplan-Meier curves and annotate the log-rank p-value.

    Args:
        rec_t, rec_e: durations and event flags of the "Recommendation" group.
        antirec_t, antirec_e: durations and event flags of the
            "Anti-Recommendation" group.
        experiment_name: prefix for both legend labels.
        output_file: when truthy, the figure is saved to this path.
    """
    # Set up the plot
    plt.figure(figsize=(12,3))
    ax = plt.subplot(111)

    # Fit survival curves
    kmf = KaplanMeierFitter()
    kmf.fit(rec_t, event_observed=rec_e, label=' '.join([experiment_name, "Recommendation"]))
    kmf.plot(ax=ax,linestyle="-")
    kmf.fit(antirec_t, event_observed=antirec_e, label=' '.join([experiment_name, "Anti-Recommendation"]))
    kmf.plot(ax=ax,linestyle="--")

    # Format graph
    plt.ylim(0,1)
    ax.set_xlabel('Timeline (months)',fontsize='large')
    ax.set_ylabel('Percentage of Population Alive',fontsize='large')

    # Calculate p-value
    results = logrank_test(rec_t, antirec_t, rec_e, antirec_e, alpha=.95)
    results.print_summary()

    # Place the label at the 1st out of 9 tick marks
    xloc = max(np.max(rec_t),np.max(antirec_t)) / 9
    if results.p_value < 1e-5:
        # Raw string: "\m" is not a valid escape sequence in a normal
        # literal.  The text handed to matplotlib is unchanged.
        ax.text(xloc,.2,r'$p < 1\mathrm{e}{-5}$',fontsize=20)
    else:
        ax.text(xloc,.2,'$p=%f$' % results.p_value,fontsize=20)

    plt.legend(loc='best',prop={'size':15})
    if output_file:
        plt.tight_layout()
        pylab.savefig(output_file)
def test_at_risk_looks_right_when_scales_are_magnitudes_of_order_larger_single_attribute(self, block):
    """Visual check: at-risk counts for five curves on very different time scales."""

    def right_edges(upper):
        # Right edge of each of the 100 equal-width bins over 0..upper-1.
        return [interval.right for interval in pd.cut(np.arange(upper), 100, retbins=False)]

    scales = (32000, 9000, 900, 90, 9)
    labels = ("Category A", "Category", "CatB", "Categ", "Categowdary B")

    timelines = [right_edges(upper) for upper in scales]
    fitters = [KaplanMeierFitter().fit(timeline, label=label)
               for timeline, label in zip(timelines, labels)]

    ax = fitters[0].plot()
    for fitter in fitters[1:]:
        ax = fitter.plot(ax=ax)

    add_at_risk_counts(*fitters, ax=ax, rows_to_show=["At risk"])
    self.plt.title("test_at_risk_looks_right_when_scales_are_magnitudes_of_order_larger")
    self.plt.tight_layout()
    self.plt.show(block=block)
def KaplanMeier_dash(T, C):
    """Fit a Kaplan-Meier curve and hand the current figure to ``pyplot``.

    ``T`` holds the durations and ``C`` the event-observed flags.  The curve
    is plotted twice, the second time with ``ci_force_lines=True``.
    """
    chart_title = 'Kaplan Meier fitter'

    fitter = KaplanMeierFitter()
    fitter.fit(T, event_observed=C)
    fitter.plot(title=chart_title)
    fitter.plot(ci_force_lines=True, title=chart_title)

    current_figure = plt.gcf()
    pyplot(current_figure, legend=False)
def main():
    """Plot Kaplan-Meier curves for low/high risk groups from bootstrap predictions.

    The pickled input is read as ``[y_orig, preds_bootfull, inds_inbag]``.
    Each sample's prediction is averaged over the bootstrap replicates in
    which it was out-of-bag, and samples are split at the median of that
    average.
    """
    args = parse_args()
    data_dir = DATA_DIR if args.data_dir is None else Path(args.data_dir)

    # NOTE(review): pickle.load can execute arbitrary code -- only use this
    # with files from a trusted source.
    with open(str(data_dir.joinpath(args.file_name)), 'rb') as f:
        payload = pickle.load(f)
    y_orig = payload[0]
    preds_bootfull = payload[1]
    inds_inbag = payload[2]
    del payload

    preds_bootfull_mat = np.concatenate(preds_bootfull, axis=1)
    inds_inbag_mat = np.array(inds_inbag).T

    # inbag_mask[s, b] is 1 when sample index s occurs in column b of
    # inds_inbag_mat, else 0.
    inbag_mask = 1*np.array([
        np.any(inds_inbag_mat == sample_idx, axis=0)
        for sample_idx in range(inds_inbag_mat.shape[0])
    ])

    # Average each sample's predictions over its out-of-bag replicates only.
    oob_mask = 1 - inbag_mask
    oob_pred_sum = np.sum(np.multiply(oob_mask, preds_bootfull_mat), axis=1)
    oob_count = np.sum(oob_mask, axis=1)
    preds_bootave_oob = np.divide(oob_pred_sum, oob_count)

    risk_groups = 1*(preds_bootave_oob > np.median(preds_bootave_oob))

    table = np.concatenate(
        (y_orig, preds_bootave_oob[:, np.newaxis], risk_groups[:, np.newaxis]),
        axis=-1)
    # Index by the risk group as a string so .loc['0'] / .loc['1'] select a
    # whole group.
    wdf = pd.DataFrame(
        table,
        columns=['status', 'time', 'preds', 'risk_groups'],
        index=[str(group) for group in risk_groups]
    )

    kmf = KaplanMeierFitter()
    ax = plt.subplot(111)
    for group_key, legend in (('0', "Low Risk"), ('1', "High Risk")):
        kmf.fit(durations=wdf.loc[group_key, 'time'],
                event_observed=wdf.loc[group_key, 'status'],
                label=legend)
        ax = kmf.plot(ax=ax)

    plt.ylim(0,1)
    plt.title("Kaplan-Meier Plots")
    plt.xlabel('Time (days)')
    plt.ylabel('Survival Probability')
def subsetsImpactSurvival(subsets, metadata, metacensorcol="overall_survival", metaDFDcol="death_from_disease", plot=False, title=None, rounding=2):
    """Fit and overlay a Kaplan-Meier curve for every subset of ``metadata``.

    subsets is a dictionary, e.g.:
    subsets={'cluster {}'.format(i):metadata.index.isin(fitrue.columns[kmeans.labels_==i]) for i in range(4)}

    Args:
        subsets: mapping of label -> boolean mask over ``metadata`` rows.
        metadata: frame holding the duration and event columns.
        metacensorcol: column passed to ``fit`` as durations.
        metaDFDcol: column passed to ``fit`` as event-observed flags.
        plot: accepted for compatibility; curves are always plotted.
        title: optional axes title.
        rounding: accepted for compatibility; not used.

    Returns:
        ``(lastvalues, ax)``.  ``lastvalues`` maps each label to
        ``(subset size, last value of the survival function)``.  ``ax`` is
        the shared axes, or ``None`` when ``subsets`` is empty.
    """
    kmf = KaplanMeierFitter()
    lastvalues = {}
    # Track the axes explicitly rather than probing for it with
    # ``except NameError``, which would also hide a genuine NameError
    # raised while plotting.
    ax = None
    for subset, members in subsets.items():
        kmf.fit(metadata[metacensorcol][members], metadata[metaDFDcol][members], label=subset)
        curve = kmf.survival_function_
        # The survival function has one column; take its value on the last
        # valid row as a scalar (float() of a Series is deprecated).
        final_survival = curve.loc[curve.last_valid_index()].iloc[0]
        lastvalues[subset] = (sum(members), float(final_survival))
        if ax is None:
            ax = kmf.plot()
        else:
            kmf.plot(ax=ax)

    if ax is not None:
        if title:
            ax.set_title(title)
        ax.set_ylim((0, 1))
    return lastvalues, ax
def plot_Kaplan_Meier_feature(donor_dataset):
    '''Plot, for every feature column, Kaplan-Meier curves of two donor groups.

    For each feature the donors are split into those whose value is above
    the feature's mean among rows with ``censored == 0``, and the rest.

    Parameters:
    donor_dataset: Pandas dataframe which contains at least the columns
    'Total_years', 'censored' and 'Baseline'. 'Total_years' represents how
    many years the donors have been active. 'censored' indicates whether a
    donor is still active (True = active donor).

    Output:
    Kaplan-Meier plot(s). This function does not return anything.
    '''
    T = donor_dataset['Total_years']
    # NOTE(review): 'censored' is passed to fit() as the event-observed
    # flag.  If True really means "still active", the flag is inverted
    # relative to what lifelines expects -- confirm against the data.
    C = donor_dataset['censored']

    features = list(donor_dataset.columns)
    for not_a_feature in ('Total_years', 'censored', 'Baseline'):
        features.remove(not_a_feature)

    for feature in features:
        uncensored_mean = donor_dataset[donor_dataset['censored'] == 0][feature].mean()
        Above_mean = donor_dataset[feature] > uncensored_mean

        fig = plt.figure(figsize=(5, 5))
        ax = fig.add_subplot(111)

        # One fitter per figure is enough: each fit is plotted before the
        # next fit replaces it.
        kmf = KaplanMeierFitter()
        kmf.fit(T[Above_mean], C[Above_mean], label = feature + ': Yes or > mean')
        kmf.plot(ax=ax, linewidth = 2)
        kmf.fit(T[~Above_mean], C[~Above_mean], label = feature + ': No or < mean')
        kmf.plot(ax=ax, linewidth = 2)

        ax.set_xlabel('Years', size = 10)
        ax.set_ylabel('Surviving donor population', size = 10)
        ax.set_xlim(0,40)
        ax.set_ylim(0, 1)
        ax.grid()
        ax.legend(loc = 'upper right', fontsize = 10)
        plt.show()
def test_kmf_with_interval_censoring_plotting(self, block):
    """Visual check: plot a KM fit on the interval-censored diabetes data."""
    lower_bounds = load_diabetes()["left"]
    upper_bounds = load_diabetes()["right"]

    fitter = KaplanMeierFitter()
    fitter.fit_interval_censoring(lower_bounds, upper_bounds)
    fitter.plot(color="r")

    self.plt.show(block=block)
    return
def kmplot(df_high, df_low, ax):
    """Draw high/low Kaplan-Meier curves on ``ax`` with a log-rank p-value.

    Both frames need ``duration`` and ``event`` columns.

    Returns:
        ``(p_value, hm5, hm10, lm5, lm10)``: the log-rank p-value followed
        by the high-group and low-group predictions at t=60 and t=120.  If
        fitting raises ``ValueError`` the placeholder
        ``("NA", "0", "0", "0", "0")`` is returned instead.
    """
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration,
                     event_observed=df_high.event,
                     label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration,
                    event_observed=df_low.event,
                    label="Low: n = " + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")

    for fitter, colour in ((kmf_high, "red"), (kmf_low, "black")):
        fitter.plot(ax=ax, color=colour, show_censors=True, ci_show=False)

    p_value = logrank_test(df_high.duration, df_low.duration,
                           event_observed_A=df_high.event,
                           event_observed_B=df_low.event).p_value

    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    # p-value in the bottom-right corner, in axes coordinates
    ax.text(0.95, 0.02, 'logrank P = ' + '%.4f' % p_value,
            verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes, color='black', fontsize=11)
    plt.legend(loc=3)

    hm5, hm10 = kmf_high.predict(60), kmf_high.predict(120)
    lm5, lm10 = kmf_low.predict(60), kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
def plot_two_groups(data, t_col_name, e_col_name, g_name, alpha):
    '''Render Kaplan-Meier curves for two groups and show the log-rank p-value.

    Args:
        data: frame holding the duration, event and group columns.
        t_col_name: name of the duration column.
        e_col_name: name of the event-observed column.
        g_name: name of the group column.  The first unique value forms one
            group and all other rows form the second, which is labelled
            with the second unique value.
        alpha: passed to ``logrank_test`` and echoed in the plot title.
    '''
    T = data[t_col_name]
    E = data[e_col_name]
    # Read the groups from the ``data`` argument; the previous code reached
    # for an unrelated name ``df`` that this function never defines.
    groups = data[g_name]

    # unique values give the names of the 1st and 2nd groups
    uniques = groups.unique()
    ix = (groups == uniques[0])

    kmf = KaplanMeierFitter()

    # plot first group
    kmf.fit(T[~ix], E[~ix], label=uniques[1])
    ax = kmf.plot()

    # plot second group
    kmf.fit(T[ix], E[ix], label=uniques[0])
    kmf.plot(ax=ax)

    # log-rank test for the p-value
    results = logrank_test(T[ix], T[~ix], E[ix], E[~ix], alpha=alpha)
    plt.title('p-value: {0:.4f}, alpha: {1:.2f}'.format(
        results.p_value, alpha))
def do_KM_analysis(durations, groups, events, group_labels, xlabel=None):
    """Overlay Kaplan-Meier curves per group and run log-rank tests.

    Groups are taken in sorted order of their distinct values and labelled
    by position from ``group_labels``.  The multivariate log-rank p-value is
    written onto the plot.  With more than two groups, the pairwise log-rank
    summary is printed as well.

    Returns:
        The fitter of the last group.
    """
    sns.set(palette = "colorblind", font_scale = 1.35,
            rc = {"figure.figsize": (8, 6), "axes.facecolor": ".92"})

    fitters = []
    for position, group_value in enumerate(sorted(set(groups))):
        in_group = groups == group_value
        kmf = KaplanMeierFitter()
        kmf.fit(durations[in_group], events[in_group], label=group_labels[position])
        fitters.append(kmf)
        if position == 0:
            # first curve creates the axes; plot() returns it
            ax = kmf.plot(ci_show=False)
        else:
            kmf.plot(ax=ax, ci_show=False)

    add_at_risk_counts(*fitters, labels=group_labels)
    ax.set_ylim(0,1.1)
    if xlabel is not None:
        ax.set_xlabel(xlabel)

    multi = multivariate_logrank_test(durations, groups, events)
    ax.text(0.1, 0.01, 'P-value=%.3f'% multi.p_value)

    if len(set(groups)) > 2:
        pair = pairwise_logrank_test(durations, groups, events)
        pair.print_summary()

    plt.show()
    return kmf
def km_curve(labels_ids, survival_dataset, tested_gene_expression_headers_columns, gene_group , k=None, label_index=None):
    """Plot one Kaplan-Meier curve per cluster and log-rank test each cluster.

    ``survival_dataset`` is indexed as a 2-D array: column 0 holds the
    sample id, column 3 the duration and column 4 the event flag.  Only
    samples that also appear in ``tested_gene_expression_headers_columns``
    are used.  Each cluster is tested against all remaining samples and
    against every previously seen cluster.  The figure is saved under
    ``<constants.BASE_PROFILE>/output``.

    NOTE(review): the nesting of the statements after the ``if`` was rebuilt
    from a whitespace-collapsed source -- confirm against the original file.

    Returns:
        The cluster-vs-rest p-values of the non-empty clusters.
    """
    ax = plt.subplot(111)
    kmf = KaplanMeierFitter()
    all_labels = np.array([y for x in labels_ids for y in x])
    label_event_list = []
    label_duration_list = []
    results = []
    for i, cur_labels in enumerate(labels_ids):
        in_cluster = (np.in1d(survival_dataset[:, 0], cur_labels)
                      & np.in1d(survival_dataset[:, 0], tested_gene_expression_headers_columns))
        label_event = survival_dataset[in_cluster, 4].astype(np.int32)
        label_duration = survival_dataset[in_cluster, 3].astype(np.int32)
        label_event_list.append(label_event)
        label_duration_list.append(label_duration)

        # complement: every other tested sample
        labels_c = all_labels[~np.in1d(all_labels, cur_labels)
                              & np.in1d(all_labels, tested_gene_expression_headers_columns)]
        in_complement = np.in1d(survival_dataset[:, 0], labels_c)
        label_event_c = survival_dataset[in_complement, 4].astype(np.int32)
        label_duration_c = survival_dataset[in_complement, 3].astype(np.int32)

        lr_results = logrank_test(label_duration, label_duration_c, label_event, label_event_c, alpha=.95)
        if len(label_duration) != 0:
            kmf.fit(list(label_duration), event_observed=list(label_event),
                    label="cluster {} n={}, logrank pval = {}".format(
                        i, len(label_duration), '{0:1.3e}'.format(lr_results.p_value)))
            kmf.plot(ax=ax, show_censors=True)
            # print(...) with one argument works as a Python 2 statement and
            # as a Python 3 function call.
            print("lrank cluster {} vs all: {}".format(i, lr_results.p_value))
            results.append(lr_results.p_value)
            # compare with every cluster seen before this one
            for j, cur_duration in enumerate(label_duration_list[:-1]):
                lr_results = logrank_test(label_duration, label_duration_list[j],
                                          label_event, label_event_list[j], alpha=.95)
                print("lrank cluster {} vs cluster {}: {}".format(i, j, lr_results.p_value))

    plt.ylim(0, 1)
    plt.title("clustering survival analysis")
    plt.savefig(os.path.join(
        constants.BASE_PROFILE, "output",
        "cluster_by_p_{}_{}_k={}_label_i={}_{}.png".format(
            constants.CANCER_TYPE, gene_group.split("/")[-1], k, label_index,
            time.time())))
    plt.cla()
    return results
def test_kmf_minimum_observation_bias():
    """Visual check: KM fit where each subject's entry time is 1% of its duration."""
    sample_size = 250
    durations, observed = exponential_survival_data(sample_size, 0.1, scale=10)
    entry_times = 0.01 * durations

    fitter = KaplanMeierFitter()
    fitter.fit(durations, observed, entry=entry_times)
    fitter.plot()
    plt.title("Should have larger variances in the tails")
def createSurvivalGraph(durations, event_observed):
    """Fit a Kaplan-Meier curve and show it without confidence intervals."""
    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed)
    fitter.plot(ci_show=False)

    plt.title("Hard Drive Kaplan Meier Survival Analysis")
    plt.ylabel("Probability a Hard Drive Survives")
    plt.show()
def plot_one_groupd(data, t_col_name, e_col_name, label):
    '''
    Plot the Kaplan-Meier curve for one group with the given label.

    ``data[t_col_name]`` supplies the durations and ``data[e_col_name]``
    the event-observed flags.
    '''
    durations = data[t_col_name]
    observed = data[e_col_name]

    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed=observed, label=label)
    fitter.plot()
def survival_plot(df, grouping_col, time_col, event_col):
    """Show one Kaplan-Meier curve per value of ``grouping_col`` on shared axes."""
    row_labels_by_group = df.groupby(grouping_col).groups
    fig, ax = plt.subplots()

    for group_name, row_labels in row_labels_by_group.items():
        fitter = KaplanMeierFitter()
        fitter.fit(df.loc[row_labels, time_col],
                   df.loc[row_labels, event_col],
                   label=group_name)
        fitter.plot(ax=ax)

    sns.despine()
    plt.show()
def makeKMplot(cph_input, cph_group, NN_method):
    """Save a survival-vs-death Kaplan-Meier plot per trial and method.

    For every entry of ``cph_input`` (matched by position with
    ``cph_group``) and every method in ``NN_method``, the test samples are
    labelled 'd' (group flag 0) or 's' (group flag 1) from the method's
    ``divide_group_*`` list.  A plot is written to ``../data/KMplot/`` only
    when more than one sample is labelled 'd'.
    """
    for k, inputs in enumerate(cph_input):
        print(k)
        (iter_num, followup, method, feature_list, x_trn, y_trn, s_trn,
         c_trn, x_tst, y_tst, s_tst, c_tst) = inputs

        cph_head = ['S', 'E'] + list(feature_list) + ['group']

        (iter_num, followup, method, score_slp, divide_group_slp,
         score_mlp, divide_group_mlp) = cph_group[k]

        for method in NN_method:
            # build the test data frame
            cph_data_test = []
            for i in range(len(x_tst)):
                row = [s_tst[i], c_tst[i]]
                row.extend(x_tst[i][j] for j in range(len(feature_list)))

                if method == 'SLP':
                    flag = divide_group_slp[i]
                elif method == 'MLP':
                    flag = divide_group_mlp[i]
                else:
                    flag = None
                if flag == 0:
                    row.append('d')
                elif flag == 1:
                    row.append('s')
                cph_data_test.append(row)

            cph_df_test = pd.DataFrame(cph_data_test, columns=cph_head)

            if len(cph_df_test.loc[cph_df_test.group == 'd']) > 1:
                print('a')
                groups = cph_df_test["group"]
                T = cph_df_test["S"]
                E = cph_df_test["E"]
                ix = (groups == 'd')

                # Give every plot its own figure and close it after saving,
                # so curves from earlier trials cannot pile up on the same
                # axes and figures do not leak.
                fig, ax = plt.subplots()
                kmf = KaplanMeierFitter()
                kmf.fit(T[~ix], E[~ix], label='survival')
                kmf.plot(ax=ax)
                kmf.fit(T[ix], E[ix], label='death')
                kmf.plot(ax=ax)

                figure_name = (str(iter_num) + 'th trial of ' + str(followup)
                               + 'year survival with ' + method)
                ax.set_title(figure_name)
                fig.savefig('../data/KMplot/' + figure_name + '.png')
                plt.close(fig)
def plot_Consensus_top_10(big_board_df, melt_df, save=False):
    """
    Plot a survival curve for each of the first ten players of the big board.

    Args:
        big_board_df (pandas DataFrame): wide-form big board; the first ten
            entries of its `player` column are plotted
        melt_df (pandas DataFrame): long-form big board with `player`,
            `duration` and `observed` columns
        save (boolean): write `../plots/top_10.png` when True, otherwise
            display the figure

    Returns:
        None
    """
    # consensus Top-10 picks: the first ten players listed
    leading_players = big_board_df['player'].to_list()[0:10]

    # one subplot row per player
    fig, axs = plt.subplots(nrows=10, ncols=1, sharey=True, sharex=False,
                            figsize=(15, 32))

    slot_positions = range(0, 15)
    slot_labels = ['{0}'.format(int(x)) for x in range(1, 15)]

    for player, ax in zip(leading_players, axs.flatten()):
        # rows belonging to this player
        is_player = melt_df.player == player

        # fit and draw this player's Kaplan-Meier curve
        fitter = KaplanMeierFitter()
        fitter.fit(melt_df.duration[is_player], melt_df.observed[is_player])
        fitter.plot(ax=ax, legend=False)

        # axis formatting: percent labels on y, draft slots on x
        ax.set(title=player, xlabel='', xlim=(0, 14), ylim=(-0.1, 1.1))
        y_vals = ax.get_yticks()
        ax.set_yticklabels(['{:3.0f}%'.format(x * 100) for x in y_vals])
        ax.set_xticks(slot_positions)
        ax.set_xticklabels(slot_labels)

    # figure-level labels and title
    fig.text(0.5, 0.001, "Draft Slot", ha="center", fontsize=18)
    fig.text(0.001, 0.5, "Probability Player is Still Available",
             va="center", rotation="vertical", fontsize=18)
    fig.suptitle("Survival Curve for Consensus Top-10 Picks", fontsize=35)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)

    # either save figure or display
    if save:
        plt.savefig('../plots/top_10.png')
    else:
        plt.show()
def cluster_kmplot(cluster_assign, surv, lr_test=True, verbose=False, tmax=-1):
    """Plot one Kaplan-Meier curve per cluster, optionally with a log-rank p-value.

    Args:
        cluster_assign: Series mapping patient -> cluster.
        surv: frame indexed by patient with ``duration`` and ``observed``
            columns.
        lr_test: when True, run the multivariate log-rank test and add the
            p-value to the title.
        verbose: when True, also print the p-value.
        tmax: upper x-axis limit; also passed to the test as ``t_0``.
            -1 leaves the x-axis unrestricted.
    """
    import seaborn as sns
    from lifelines import KaplanMeierFitter
    from lifelines.statistics import multivariate_logrank_test as multiv_lr_test
    import matplotlib.pyplot as plt

    # Initialize KM plotter
    kmf = KaplanMeierFitter()

    # Number of clusters
    clusters = sorted(list(cluster_assign.value_counts().index))
    k = len(clusters)

    # Set title
    title = "Survival plot k = " + str(k)

    # Initialize KM plot settings
    plt.figure(figsize=(10, 7))
    ax = plt.subplot(1, 1, 1)
    colors = sns.color_palette('hls', k)
    cluster_cmap = {clusters[i]: colors[i] for i in range(k)}

    for clust in clusters:
        clust_pats = list(cluster_assign[cluster_assign == clust].index)
        # skip clusters with fewer than two patients that have survival data
        if len(set(clust_pats) & set(surv.index)) < 2:
            continue
        clust_surv_data = surv.loc[clust_pats, :].dropna()
        kmf.fit(clust_surv_data.duration, clust_surv_data.observed,
                label='Group ' + str(clust) + ' (n=' + str(len(clust_surv_data)) + ')')
        kmf.plot(ax=ax, color=cluster_cmap[clust], ci_show=False)

    if tmax != -1:
        plt.xlim((0, tmax))
    plt.xlabel('Time (Days)', fontsize=16)
    plt.ylabel('Survival Probability', fontsize=16)
    # Lower-case keyword: current matplotlib no longer accepts the
    # case-insensitive spelling ``FontSize``.
    _ = plt.xticks(fontsize=16)
    _ = plt.yticks(fontsize=16)

    # Multivariate logrank test
    if lr_test:
        cluster_survivals = pd.concat([surv, cluster_assign], axis=1).dropna().astype(int)
        p = multiv_lr_test(np.array(cluster_survivals.duration),
                           np.array(cluster_survivals[cluster_assign.name]),
                           t_0=tmax,
                           event_observed=np.array(cluster_survivals.observed)).p_value
        if verbose:
            print('Multi-Class Log-Rank P:', p)
        plt.title(title + '\np=' + repr(round(p, 4)), fontsize=20, y=1.02)
    else:
        plt.title(title, fontsize=20, y=1.02)
    return
def km(request, cancer, site, cut):
    """Render the KM page comparing the highest and lowest editing-level groups.

    The top and bottom ``cut`` fraction of tumor samples (ordered by
    ``level``) for the given cancer type and site form the 'high' and 'low'
    groups.  The plot is always written to the same file under the static
    image directory.
    """
    levels = ll.objects.filter(sample__cancer_type=cancer,
                               sample__is_tumor=True, site=site)
    ingroup = int(round(float(cut) * levels.count()))
    hgq = levels.order_by('-level')[:ingroup]
    lgq = levels.order_by('level')[:ingroup]

    days = []
    death = []
    group = []

    def collect(queryset, tag):
        # A death value of -1 means no death was recorded: use the follow-up
        # time as a censored observation, unless that is missing ('--') too.
        rows = queryset.values_list('sample__days_to_death',
                                    'sample__days_to_last_followup')
        for dead, alive in rows:
            if dead != -1:
                days.append(dead)
                death.append(True)
                group.append(tag)
            elif alive != '--':
                days.append(int(alive))
                death.append(False)
                group.append(tag)

    collect(hgq, 'h')
    collect(lgq, 'l')

    from pandas import DataFrame as Df
    table = Df([days, death, group]).T
    days = table[0]
    death = table[1]
    groups = table[2]
    is_low = (groups == 'l')

    # NOTE(review): the figure is never closed and every request writes the
    # same file -- confirm this is acceptable for a long-running server.
    kmf = KaplanMeierFitter()
    kmf.fit(days[~is_low], death[~is_low], label='high group')
    ax = kmf.plot()
    kmf.fit(days[is_low], death[is_low], label='low group')
    pic_path = '/var/www/rnaedit/static/img/km/p1.png'
    fig = kmf.plot(ax=ax).get_figure()
    fig.savefig(pic_path)

    s1 = Site.objects.get(key=site)
    mt = {'cut': cut, 'site': s1, 'ig': ingroup, 'cancer': cancer}
    return render(request, "km.html", {
        'pic': pic_path.split('static/')[1],
        'meta': mt
    })
def plot_kaplan_meier(kmf, cancer_type_list):
    """Plot one Kaplan-Meier curve per cancer type from the module-level ``data``.

    NOTE(review): the ``kmf`` argument is replaced by a fresh fitter
    straight away, so whatever the caller passes is ignored.
    """
    kmf = KaplanMeierFitter()
    for cancer_type in cancer_type_list:
        print(cancer_type)
        subset = data.loc[data["project"] == cancer_type]
        print(subset)

        observed = subset["vital_status"]
        # Patients without a days_to_death value get the largest observed
        # value in this subset as their duration.
        raw_duration = subset["days_to_death"]
        duration = raw_duration.fillna(raw_duration.max())

        kmf.fit(duration, observed, label=cancer_type)
        kmf.plot(ci_show=False)
def survival(time, status, pGroups=None):
    """Plot Kaplan-Meier curves for all samples or for the given groups.

    Without ``pGroups`` a single red curve is drawn from the entries at
    index 2 onwards.  With ``pGroups`` one curve is drawn per group, where
    each group is read as ``(label, color, sample indices)``, and the
    multivariate log-rank p-value is added as a legend title.  Entries with
    an empty-string time or status are ignored.

    Returns:
        The axes drawn on, or ``None`` when no group had usable samples.
    """
    kmf = KaplanMeierFitter()

    def usable(i):
        return time[i] != "" and status[i] != ""

    if pGroups is None:
        order = [i for i in range(2, len(time)) if usable(i)]
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        kmf.fit(t, s)
        return kmf.plot(color='red')

    ax = None
    # group number per sample; "" marks samples that belong to no group
    groups = ["" for _ in time]
    for k, spec in enumerate(pGroups):
        order = [i for i in spec[2] if usable(i)]
        if len(order) <= 0:
            continue
        for i in order:
            groups[i] = k
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        kmf.fit(t, s, label = spec[0])
        if ax is None:
            ax = kmf.plot(color=spec[1], ci_show=False, show_censors=True)
        else:
            ax = kmf.plot(ax = ax, color=spec[1], ci_show=False, show_censors=True)

    order = [i for i in range(len(groups)) if groups[i] != ""]
    if len(order) > 0:
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        g = [int(groups[i]) for i in order]
        from lifelines.statistics import multivariate_logrank_test
        from matplotlib.legend import Legend
        res = multivariate_logrank_test(t, g, s)
        # empty legend whose only content is the p-value title
        leg = Legend(ax, [], [], title = "p = %.2g" % res.p_value,
                     loc='lower left', frameon=False)
        ax.add_artist(leg)
    return ax
def kmplot(df_high, df_low, ax):
    """Plot high vs. low Kaplan-Meier curves on ``ax`` and compare them.

    ``df_high`` and ``df_low`` must expose ``duration`` and ``event``.

    Returns:
        A 5-tuple: the log-rank p-value, then the high-group predictions at
        t=60 and t=120, then the low-group predictions at t=60 and t=120.
        When either fit raises ``ValueError`` the tuple
        ``("NA", "0", "0", "0", "0")`` is returned and nothing is drawn.
    """
    high_label = 'High: n = ' + str(len(df_high))
    low_label = "Low: n = " + str(len(df_low))

    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations = df_high.duration, event_observed = df_high.event, label = high_label)
        kmf_low.fit(durations = df_low.duration, event_observed = df_low.event, label = low_label)
    except ValueError:
        return("NA", "0", "0", "0", "0")

    shared_style = dict(show_censors=True, ci_show=False)
    kmf_high.plot(ax = ax, color = "red", **shared_style)
    kmf_low.plot(ax = ax, color = "black", **shared_style)

    statistics_result = logrank_test(
        df_high.duration, df_low.duration,
        event_observed_A = df_high.event,
        event_observed_B = df_low.event)
    p_value = statistics_result.p_value

    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    annotation = 'logrank P = ' + str('%.4f' % p_value)
    ax.text(0.95, 0.02, annotation,
            verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes, color = 'black', fontsize = 11)
    plt.legend(loc=3)

    high_at_60 = kmf_high.predict(60)
    high_at_120 = kmf_high.predict(120)
    low_at_60 = kmf_low.predict(60)
    low_at_120 = kmf_low.predict(120)
    return(p_value, high_at_60, high_at_120, low_at_60, low_at_120)
def plot_Kaplan_Meier_overall(donor_dataset):
    '''Plot the overall Kaplan-Meier curve of the donors' lifetimes.

    Parameters:
    donor_dataset: Pandas dataframe which contains at least the columns
    'Total_years' and 'censored'. 'Total_years' represents how many years
    the donors have been active. 'censored' indicates whether a donor is
    still active (True = active donor).

    Output:
    A Kaplan-Meier plot. This function does not return anything.
    '''
    # 'Total_years' is used as the duration and 'censored' is handed to the
    # fitter as its second (event) argument.
    years_active = donor_dataset['Total_years']
    censored_flag = donor_dataset['censored']

    # Fit a single curve over all donors.
    fitter = KaplanMeierFitter()
    fitter.fit(years_active, censored_flag, label = 'Overall')

    # Draw it on a fresh 5x5 figure.
    figure = plt.figure(figsize=(5, 5))
    axes = figure.add_subplot(111)
    fitter.plot(ax=axes)

    axes.set_xlabel('Years', size = 20)
    axes.set_ylabel('Surviving donor population', size = 20)
    axes.set_xlim(0,40)
    axes.set_ylim(0, 1)
    axes.grid()
    axes.legend(loc = 'best', fontsize = 20)
    plt.show()
    return
def get_sa(request):
    """Render 'sa_test.html' with Kaplan-Meier and Nelson-Aalen plot images.

    The two images live under ``<project dir>/images``.  They are generated
    from the Waltons example dataset whenever at least one of them is
    missing; the template context carries their relative paths under the
    keys ``'kmf'`` and ``'naf'``.
    """
    dirname = os.path.dirname(os.path.dirname(__file__)).replace('\\', '/')
    kmffile = '/images/test1.jpg'
    naffile = '/images/test2.jpg'
    context = {'kmf': kmffile, 'naf': naffile}

    both_exist = (os.path.exists(dirname + kmffile)
                  and os.path.exists(dirname + naffile))
    if not both_exist:
        df = load_waltons()
        T = df['T']  # an array of durations
        E = df['E']  # boolean/binary array: was the 'death' observed?

        kmf = KaplanMeierFitter(alpha=0.95)
        kmf.fit(durations=T, event_observed=E, timeline=None, entry=None,
                label='KM_estimate', alpha=None, left_censorship=False,
                ci_labels=None)
        naf = NelsonAalenFitter(alpha=0.95, nelson_aalen_smoothing=True)
        naf.fit(durations=T, event_observed=E, timeline=None, entry=None,
                label='NA_estimate', alpha=None, ci_labels=None)

        # One fresh figure per estimator so the second image does not also
        # contain the first curve; close each figure once it is on disk.
        for fitter, target in ((kmf, kmffile), (naf, naffile)):
            fig = plt.figure()
            fitter.plot(ax=fig.add_subplot(111))
            fig.savefig(dirname + target)
            plt.close(fig)

    return render(request=request, template_name='sa_test.html',
                  context=context)
def surAnalysis(storeId):
    """Fit and save a Kaplan-Meier plot for one store.

    Reads every document of the ``survival`` collection whose ``store_id``
    equals ``storeId``, converts ``duration`` from seconds to days
    (division by 86400) and saves the fitted curve as an image named after
    the store id.  Does nothing when the store has no documents.
    """
    duration = []
    observed = []
    for elem in survival.find({'store_id': storeId}):
        duration.append(elem['duration'] / 86400)
        observed.append(elem['observed'])

    if not duration:
        return

    kmf = KaplanMeierFitter()
    kmf.fit(array(duration), array(observed))
    ax = kmf.plot()
    fig = ax.get_figure()
    # Backslashes are escaped explicitly; the resulting path is unchanged.
    fig.savefig('F:\\workshop\\lbs_lyf\\static\\images\\' + storeId)
    plt.close(fig)
def __init__(self, db, male=False, female=False, other=False, both=True):
    """Fit Kaplan-Meier curves from ``db`` and save them as 'maleAndFemale'.

    Every document returned by ``db.find()`` must provide ``duration``
    (seconds, converted to days here), ``observed`` and ``gender``.
    Gender codes 1 / 2 / 0 select the 'male' / 'female' / 'other' curves.

    Args:
        db: collection-like object exposing ``find()``.
        male, female, other: pass ``True`` to add the curve of that subgroup.
        both: draw the curve over all records (default).

    No figure is saved when every curve is switched off.
    """
    self.db = db
    self.male = male
    self.female = female
    self.other = other
    self.both = both

    duration = []
    observed = []
    group = []
    for elem in self.db.find():
        duration.append(elem['duration'] / 86400)
        observed.append(elem['observed'])
        group.append(elem['gender'])

    dura_obj = array(duration)
    obs_obj = array(observed)
    group_obj = array(group)

    # (enabled, row selector, legend label).  The subgroup flags keep their
    # strict ``is True`` check; slice(None) selects every record.
    curves = (
        (bool(self.both), slice(None), 'both'),
        (self.male is True, group_obj == 1, 'male'),
        (self.female is True, group_obj == 2, 'female'),
        (self.other is True, group_obj == 0, 'other'),
    )

    kmf = KaplanMeierFitter()
    ax = None
    for enabled, rows, label in curves:
        if not enabled:
            continue
        kmf.fit(dura_obj[rows], obs_obj[rows], label=label)
        # The first curve creates the axes, later ones are drawn onto it.
        ax = kmf.plot() if ax is None else kmf.plot(ax=ax)

    if ax is not None:
        ax.get_figure().savefig('maleAndFemale')
def plot_survival(self):
    """Show Kaplan-Meier curves by subgroup, then survival-time histograms.

    Data comes from the parent class' ``load_data``.  The first figure has
    five stacked panels (radiation, ER status, PR status, stage, age), each
    comparing subgroups of the same sample; the event indicator is
    ``STAT_REC == 4`` and time is ``SRV_TIME_MON`` converted to years.  The
    second figure shows histograms of survival time split by that same
    indicator.  Both figures are displayed with ``plt.show()``; nothing is
    returned.
    """
    import warnings

    df = super().load_data(
        col=['YR_BRTH', 'AGE_DX', 'LATERAL', 'RADIATN', 'HISTREC',
             'ERSTATUS', 'PRSTATUS', 'BEHANAL', 'HST_STGA', 'NUMPRIMS',
             'SRV_TIME_MON', 'SRV_TIME_MON_PA', 'DTH_CLASS', 'O_DTH_CLASS',
             'STAT_REC'],
        cond='SRV_TIME_MON < 1000 AND HST_STGA < 8 AND DTH_CLASS < 9 AND ERSTATUS < 4 AND PRSTATUS < 4',
        sample_size=100000)
    kmf = KaplanMeierFitter()

    # Best effort: fold code 7 into 0 and drop anything above.  A failure is
    # reported instead of being silently ignored.
    try:
        df['RADIATN'] = df['RADIATN'].replace(7, 0)
        df = df[df['RADIATN'] < 7]
    except Exception as err:
        warnings.warn("RADIATN recoding skipped: %s" % err)

    # Recode receptor status to 0-negative, 1-borderline, 2-positive after
    # dropping codes 4 and 9.  The replacements are applied one after another
    # in this exact order.
    for column in ('ERSTATUS', 'PRSTATUS'):
        # copy() so the assignments below do not write to a filtered slice
        df = df[(df[column] != 4) & (df[column] != 9)].copy()
        for old, new in ((2, 0), (1, 2), (3, 1)):
            df[column] = df[column].replace(old, new)

    rad = df.RADIATN > 0
    er = df.ERSTATUS > 0
    pr = df.PRSTATUS > 0
    st0 = df.HST_STGA == 0
    st1 = df.HST_STGA == 1
    st2 = df.HST_STGA == 2
    st4 = df.HST_STGA == 4
    age = df.AGE_DX < 50

    df['SRV_TIME_YR'] = df['SRV_TIME_MON'] / 12
    T = df['SRV_TIME_YR']
    C = df.STAT_REC == 4

    # One entry per panel: the (row mask, legend label) pairs drawn on it.
    panels = (
        ((rad, "Radiation"), (~rad, "No Radiation")),
        ((er, "ER Positive"), (~er, "ER Negative")),
        ((pr, "PR Positive"), (~pr, "PR Negative")),
        ((st0, "Stage 0"), (st1, "Stage 1"),
         (st2, "Stage 2"), (st4, "Stage 4")),
        ((age, "Age < 50"), (~age, "Age >= 50")),
    )

    f, ax = plt.subplots(5, sharex=True, sharey=True)
    ax[0].set_title("Lifespans of cancer patients")
    for panel_ax, curves in zip(ax, panels):
        for mask, label in curves:
            kmf.fit(T[mask], event_observed=C[mask], label=label)
            kmf.plot(ax=panel_ax)
    for panel_ax in ax:
        panel_ax.legend(loc=3, prop={'size': 10})

    ax[-1].set_xlabel('Survival in years')
    f.text(0.04, 0.5, 'Survival %', va='center', rotation='vertical')
    plt.tight_layout()
    plt.ylim(0, 1)
    plt.show()

    f, ax = plt.subplots(2, sharex=True, sharey=True)
    df.hist('SRV_TIME_YR', by=df.STAT_REC != 4, ax=(ax[0], ax[1]))
    ax[0].set_title('Histogram of Non Censored Patients')
    ax[0].set_ylabel('Number of Patients')
    ax[1].set_ylabel('Number of Patients')
    ax[1].set_title('Histogram of Censored Patients')
    ax[1].set_xlabel('Survival in Years')
    plt.show()
    return
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

df = pd.read_csv('joined.csv.bz2', sep=',', compression='bz2', low_memory=False)

# Column 'term' holds text such as '36 months'; keep only the number.
# (str.strip(' months') removes a *set of characters*, not that suffix.)
df['term'] = df['term'].str.extract(r'(\d+)', expand=False).astype(int)

# Prepare column 'T' for the survival model: fraction of the term elapsed
# before the first missed payment; fully paid loans count as the whole term.
df['T'] = df['firstMissed'] / df['term']
df.loc[df['loan_status'] == 'Fully Paid', 'T'] = 1

# The event indicator is the negation of column 'censored'.
T = df['T']
E = ~df['censored']

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)  # more succinctly, kmf.fit(T, E)

# Bare expressions print nothing outside an interactive session.
print(kmf.survival_function_)
print(kmf.median_)
kmf.plot()
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
from lifelines import KaplanMeierFitter
from lifelines.utils import datetimes_to_durations

df = pd.read_csv('data/parl_data.csv')
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
# First three digits of the start year identify the decade ('199', '200').
df['decade'] = df['start_date'].map(lambda d: str(d.year)[:3])

T, C = datetimes_to_durations(df['start_date'], df['end_date'])
df['T'] = T
df['C'] = C

kmf = KaplanMeierFitter()
ax = plt.subplot(111)
for decade in df['decade'].unique():
    ix = df['decade'] == decade
    # .loc replaces the .ix indexer, which pandas no longer provides.
    kmf.fit(df.loc[ix, 'T'], df.loc[ix, 'C'], label=decade)
    if decade not in ('200', '199'):
        # Older decades are drawn as thin grey background lines.
        kmf.plot(ax=ax, c='#777777', ci_show=False, alpha=0.5)
    else:
        kmf.plot(ax=ax, lw=4)
# Griffin Calme
# Group 15, week 8 activity
# Kaplan Meier survival curve
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

kmf = KaplanMeierFitter()

# read_csv with index_col=0 / parse_dates=True reproduces the defaults of
# DataFrame.from_csv, which pandas no longer provides.
df = pd.read_csv('wk8gp15KapMeier.csv', index_col=0, parse_dates=True)
print(df)

groups = df['Group']
ix = (groups == 2)

T = df['SERIAL TIME (years)']
E = df['STATUS']

# Everything that is not group 2 is plotted under the label '1'.
kmf.fit(T[~ix], E[~ix], label='1')
ax = kmf.plot()

kmf.fit(T[ix], E[ix], label='2')
kmf.plot(ax=ax, ci_force_lines=False)

plt.show()
def data_fit(self):
    """Fit a Kaplan-Meier curve of subscription lifetimes and save it.

    For every user that has a 'subscribe' event, the lifetime runs from the
    user's earliest event to either the user's latest event (if an
    'unsubscribe' event exists, observed = 1) or the current time
    (observed = 0).  Lifetimes are converted from seconds to days and the
    fitted curve is saved as 'maleAndFemale'.
    """
    # Index creation is a persistent side effect on the collections, so all
    # three calls are kept even though hyd_users is not queried here.
    self.hyd_events.create_index('FromUserName')
    self.hyd_events.create_index('Event')
    self.hyd_users.create_index('openid')

    user_list = list(set(
        elem['FromUserName']
        for elem in self.hyd_events.find({'Event': 'subscribe'})))
    print(len(user_list))
    now_time = time.time()

    duration = []
    observed = []
    for user in user_list:
        # Collect the timestamps of THIS user only; sharing one list across
        # users would make min()/max() span everybody seen so far.
        event_times = [
            int(item['CreateTime'])
            for item in self.hyd_events.find({'FromUserName': user})]
        sub_time = min(event_times)

        unsubscribed = self.hyd_events.find_one(
            {'$and': [{'FromUserName': user}, {'Event': 'unsubscribe'}]})
        if unsubscribed is None:
            end_time = int(now_time)
            observed.append(0)
        else:
            end_time = max(event_times)
            observed.append(1)

        duration.append(abs(end_time - sub_time) / 86400)

    dura_obj = array(duration)
    obs_obj = array(observed)

    kmf = KaplanMeierFitter()
    kmf.fit(dura_obj, obs_obj, label='both')
    ax = kmf.plot()
    ax.get_figure().savefig('maleAndFemale')
# Kaplan-Meier curves (male, female, everyone) for the after_users collection.
duration = []
observed = []
group = []
for elem in after_users.find():
    duration.append(elem['duration'] / 86400)  # seconds -> days
    observed.append(elem['observed'])
    group.append(elem['gender'])

dura_obj = array(duration)
obs_obj = array(observed)
group_obj = array(group)

# Gender codes: 1 = male, 2 = female, 0 = other.
male = group_obj == 1
female = group_obj == 2
other = group_obj == 0

kmf = KaplanMeierFitter()
kmf.fit(dura_obj[male], obs_obj[male], label='male')
ax = kmf.plot()  # the first curve creates the axes the others reuse
kmf.fit(dura_obj[female], obs_obj[female], label='female')
kmf.plot(ax=ax)
kmf.fit(dura_obj, obs_obj, label='both')
kmf.plot(ax=ax)

ax.get_figure().savefig('maleAndFemale_both_17day')
def plot_survival(unique_groups, grouped_data, analysis_type, censors, ci, showplot, stat_results, time='Months'):
    """Plot one Kaplan-Meier curve per group and log survival percentiles.

    Args:
        unique_groups: group keys, in plotting order.
        grouped_data: object whose ``get_group(key)`` returns a frame with
            ``'survival'`` (in days) and ``'event'`` columns.
        analysis_type: used in the output file names; ``'survival'`` selects
            the 'Overall Survival' y-label, anything else
            'Relapse-Free Survival'.
        censors: forwarded to the plot as ``show_censors``.
        ci: forwarded to the plot as ``ci_show``.
        showplot: show the figure interactively before closing it.
        stat_results: object with a ``p_value`` attribute; printed on the
            plot only when there are exactly two groups.
        time: ``'Months'`` or ``'Years'`` (case-insensitive) converts the
            durations from days; any other value leaves them unchanged.

    Side effects: appends a table to ``Kaplan_<analysis_type>.txt`` and
    writes ``Kaplan_<analysis_type>.png`` and ``.eps``.
    """
    kmf = KaplanMeierFitter()
    fig, ax = plt.subplots()
    n_in_groups = []
    percentiles = range(95, -1, -5)

    # Days per plotted time unit.  Float literals keep the conversion exact
    # even where ``/`` on two ints would truncate.
    unit = time.lower()
    if unit == 'months':
        days_per_unit = 365.0 / 12
    elif unit == 'years':
        days_per_unit = 365.0
    else:
        days_per_unit = None

    # ``with`` guarantees the table file is closed, even if a fit fails.
    with open('Kaplan_%s.txt' % (analysis_type), 'a') as f:
        f.write("\nPercent %s\n" % analysis_type)
        headers = "Group\t" + "".join(str(x) + "%\t" for x in percentiles)
        f.write("%s\n" % headers)

        for group in unique_groups:
            data = grouped_data.get_group(group)
            n_in_groups.append(len(data))

            if days_per_unit is None:
                survival_time = data['survival']
            else:
                survival_time = data['survival'] / days_per_unit

            kmf.fit(survival_time, data['event'], label=group)

            f.write("%s\t" % group)
            f.write("".join(
                str(qth_survival_times(x / 100.0, kmf.survival_function_)) + "\t"
                for x in percentiles))
            f.write("\n")

            kmf.plot(ax=ax, show_censors=censors, ci_show=ci, linewidth=2.5)

    # Make the graph pretty!
    labels = dict(horizontalalignment='center', verticalalignment='center',
                  fontname='Arial', fontsize=28)
    ax.grid(False)
    ax.set_ylim(0, 1.05)
    for side in ('left', 'right', 'top', 'bottom'):
        ax.spines[side].set_linewidth(2.5)
    ax.yaxis.set_tick_params(width=2.5)
    ax.xaxis.set_tick_params(width=2.5)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    plt.xlabel('%s Post-Diagnosis' % time, labels, labelpad=20)
    if analysis_type == 'survival':
        plt.ylabel('Overall Survival', labels, labelpad=20)
    else:
        plt.ylabel('Relapse-Free Survival', labels, labelpad=20)
    plt.xticks(fontname='Arial', fontsize=24)
    plt.yticks(fontname='Arial', fontsize=24)
    ax.tick_params(axis='y', pad=10)
    ax.tick_params(axis='x', pad=10)

    # Relabel the legend entries with the group size.
    legend = ax.legend(frameon=False, loc=3)
    for label, group, count in zip(legend.get_texts(), unique_groups, n_in_groups):
        label.set_fontsize(20)
        label.set_text('%s n=%d' % (group, count))

    if len(unique_groups) == 2:
        plt.text(0.95, 0.05, 'p = %.2g' % (stat_results.p_value),
                 fontname='Arial', fontsize=20, ha='right',
                 transform=ax.transAxes)

    plt.tight_layout()
    fig.savefig('Kaplan_%s.png' % analysis_type, transparent=True)
    fig.savefig('Kaplan_%s.eps' % analysis_type, transparent=True)
    if showplot:
        plt.show()
    plt.close(fig)
def _plot_kmf_single(df, condition_col, survival_col, censor_col, threshold, title, xlabel, ylabel, ax, with_condition_color, no_condition_color, with_condition_label, no_condition_label, color_map, label_map, color_palette, ci_show, print_as_title):
    """
    Helper function to produce a single KM survival plot, among observations
    in df by groups defined by condition_col.

    All inputs are required - this function is intended to be called by
    `plot_kmf`.

    Groups are derived from ``condition_col`` as follows:

    - ``threshold`` given: two groups, ``> threshold`` vs. the rest
      (``"median"`` uses the column median);
    - object / category column: one group per distinct value;
    - bool column: two groups, True vs. False.

    Any other dtype raises ``ValueError``.

    Returns the result of the statistical comparison: a log-rank test for
    exactly two plotted groups, a ``NullSurvivalResults`` for fewer than
    two, and a fitted Cox PH model otherwise.  ``survival_data_series``,
    ``event_data_series`` and ``desc`` are attached to the returned object.
    """
    # make color inputs consistent hex format
    if colors.is_color_like(with_condition_color):
        with_condition_color = colors.to_hex(with_condition_color)
    if colors.is_color_like(no_condition_color):
        no_condition_color = colors.to_hex(no_condition_color)

    ## prepare data to be plotted; producing 3 outputs:
    # - `condition`, series containing category labels to be plotted
    # - `label_map` (mapping condition values to plot labels)
    # - `color_map` (mapping condition values to plotted colors)
    column = df[condition_col]
    if threshold is not None:
        is_median = threshold == "median"
        if is_median:
            threshold = column.median()
        label_suffix = float_str(threshold)
        condition = column > threshold
        default_label_no_condition = "%s ≤ %s" % (condition_col, label_suffix)
        if is_median:
            label_suffix += " (median)"
        default_label_with_condition = "%s > %s" % (condition_col, label_suffix)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}
    elif column.dtype == 'O' or column.dtype.name == "category":
        condition = column.astype("category")
        if not label_map:
            label_map = {
                condition_value: '{} = {}'.format(condition_col, condition_value)
                for condition_value in condition.unique()
            }
        if not color_map:
            rgb_values = sb.color_palette(color_palette, len(label_map.keys()))
            hex_values = [colors.to_hex(col) for col in rgb_values]
            color_map = dict(zip(label_map.keys(), hex_values))
    elif column.dtype == 'bool':
        condition = column
        default_label_with_condition = "= {}".format(condition_col)
        default_label_no_condition = "¬ {}".format(condition_col)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}
    else:
        raise ValueError(
            "Don't know how to plot data of type {}".format(column.dtype))

    # produce kmf plot for each category (group) identified above
    kmf = KaplanMeierFitter()
    # Group names are recorded from the groupby itself so the statistics
    # below refer to exactly the groups that were plotted (groupby skips
    # missing keys that ``unique()`` would still report).
    grp_names = []
    grp_desc = []
    grp_survival_data = {}
    grp_event_data = {}
    for grp_name, grp_df in df.groupby(condition):
        grp_survival = grp_df[survival_col]
        grp_event = grp_df[censor_col].astype(bool)
        grp_label = label_map[grp_name]
        grp_color = color_map[grp_name]
        kmf.fit(grp_survival, grp_event, label=grp_label)

        grp_names.append(grp_name)
        grp_desc.append("# {}: {}".format(grp_label, len(grp_survival)))
        grp_survival_data[grp_name] = grp_survival
        grp_event_data[grp_name] = grp_event

        # The first curve creates the axes when the caller supplied none.
        if ax is not None:
            ax = kmf.plot(ax=ax, show_censors=True, ci_show=ci_show,
                          color=grp_color)
        else:
            ax = kmf.plot(show_censors=True, ci_show=ci_show,
                          color=grp_color)

    ## format the plot
    # Set the y-axis to range 0 to 1, labelled as percentages
    ax.set_ylim(0, 1)
    y_tick_vals = ax.get_yticks()
    ax.set_yticklabels(["%d" % int(y_tick_val * 100)
                        for y_tick_val in y_tick_vals])
    # plot title
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title(' | '.join(grp_desc))
    else:
        for desc in grp_desc:
            print(desc)
    # axis labels
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    ## summarize analytical version of results
    ## again using same groups as are plotted
    if len(grp_names) == 2:
        # use log-rank test for 2 groups
        first, second = grp_names
        results = logrank_test(grp_survival_data[first],
                               grp_survival_data[second],
                               event_observed_A=grp_event_data[first],
                               event_observed_B=grp_event_data[second])
    elif len(grp_names) <= 1:
        # no analytical result for 1 or 0 groups
        results = NullSurvivalResults()
    else:
        # cox PH fitter for >2 groups
        cf = CoxPHFitter()
        cox_df = patsy.dmatrix(
            '+'.join([condition_col, survival_col, censor_col]),
            df, return_type='dataframe')
        del cox_df['Intercept']
        results = cf.fit(cox_df, survival_col, event_col=censor_col)
        results.print_summary()

    # add metadata to results object so caller can print them
    results.survival_data_series = grp_survival_data
    results.event_data_series = grp_event_data
    results.desc = grp_desc
    return results
def execute():
    """Plot Kaplan-Meier survival curves for the most common categories.

    Queries incident duration, subject survival, group category and group
    size from the ISRID SQLite database, then produces two SVG figures under
    ``../doc/figures/kaplan-meier/``: a 2x2 grid comparing multi-person
    groups with single subjects for the four most frequent categories, and
    one combined plot of the fitters collected while building the grid.
    The figures are also shown interactively.
    """
    matplotlib.rc("font", size=20)

    engine, session = database.initialize("sqlite:///../data/isrid-master.db")
    try:
        # Query with Group.size may take awhile, at least for Charles
        # Not sure why
        query = session.query(Incident.total_hours, Subject.survived,
                              Group.category, Group.size).join(Group, Subject)
        print("Tabulating query... may take awhile for unknown reasons.")
        df = tabulate(query)
        print("Done tabulating.")
        print(df.describe())
    finally:
        # Release the database even if the query or tabulation fails.
        database.terminate(engine, session)

    df = df.assign(
        days=[total_hours.total_seconds() / 3600 / 24
              for total_hours in df.total_hours],
        doa=[not survived for survived in df.survived],
    )
    df = df[0 <= df.days]

    rows, columns = 2, 2
    grid, axes = plt.subplots(rows, columns, figsize=(15, 10))
    categories = Counter(df.category)
    kmfs = []
    options = {"show_censors": True,
               "censor_styles": {"marker": "|", "ms": 6},
               "censor_ci_force_lines": False}

    for plot, (category, count) in enumerate(categories.most_common(rows * columns)):
        print("Category:", category)
        ax = axes[plot // columns, plot % columns]
        df_ = df[df.category == category]
        N, Ndoa = len(df_), sum(df_.doa)
        Srate = 100 * (1 - Ndoa / N)

        # The column must be read with [] here: ``df_.size`` is the
        # DataFrame attribute (total number of cells), not the group size.
        group_size = df_["size"]
        grp = df_[group_size > 1]
        sng = df_[group_size == 1]

        kmf = KaplanMeierFitter()
        kmf.fit(grp.days, event_observed=grp.doa, label=category + " Groups")
        kmf.plot(ax=ax, **options)
        kmf.fit(sng.days, event_observed=sng.doa, label=category + " Singles")
        kmf.plot(ax=ax, **options)
        # NOTE(review): the fitter was refit above, so the combined figure
        # below shows only the "Singles" curve of each category - confirm
        # that this is intended.
        kmfs.append(kmf)

        ax.set_xlim(0, min(30, 1.05 * ax.get_xlim()[1]))
        ax.set_ylim(0, 1)
        ax.set_title("{}, N = {}, DOA = {}, {:.0f}% surv".format(
            category, N, Ndoa, Srate))
        ax.set_xlabel("Total Incident Time (days)")
        ax.set_ylabel("Probability of Survival")

    grid.suptitle("Kaplan-Meier Survival Curves", fontsize=25)
    grid.tight_layout()
    grid.subplots_adjust(top=0.9)
    grid.savefig("../doc/figures/kaplan-meier/km-grid-large.svg",
                 transparent=True)

    combined = plt.figure(figsize=(15, 10))
    ax = combined.add_subplot(1, 1, 1)
    for kmf in kmfs[: rows * columns]:
        kmf.plot(ci_show=False, show_censors=True,
                 censor_styles={"marker": "|", "ms": 6}, ax=ax)
    ax.set_xlim(0, 15)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Total Incident Time (days)")
    ax.set_ylabel("Probability of Survival")
    ax.set_title("Kaplan-Meier Survival Curves", fontsize=25)
    ax.grid(True)
    combined.savefig("../doc/figures/kaplan-meier/km-combined-large.svg",
                     transparent=True)
    plt.show()
def plot_kmf(df, condition_col, censor_col, survival_col, threshold=None, title=None, xlabel=None, ax=None, print_as_title=False):
    """
    Plot two Kaplan-Meier curves: rows where the condition holds vs. the rest.

    With a threshold, the condition is ``condition_col > threshold``
    ('median' means the column median); without one, ``condition_col`` itself
    is used as the row mask.

    Parameters
    ----------
        df: dataframe
        condition_col: string, column which contains the condition to split on
        censor_col: string, column converted to bool and used as the
            event-observed indicator
        survival_col: string, column which contains the survival time
        threshold: int or string, if int, condition_col is thresholded,
            if 'median', condition_col thresholded at its median
        title: Title for the plot, default None
        xlabel: x-axis label, default None
        ax: an existing matplotlib ax, optional, default None
        print_as_title: bool, optional, whether or not to print the group
            sizes within the plot's title vs. stdout, default False

    Returns
    -------
        The result of the log-rank test between the two groups.
    """
    fitter = KaplanMeierFitter()

    if threshold is None:
        mask = df[condition_col]
        legend_label = '{}'.format(condition_col)
    else:
        if threshold == 'median':
            threshold = df[condition_col].median()
        mask = df[condition_col] > threshold
        legend_label = '{} > {}'.format(condition_col, threshold)

    rows_with = df[mask]
    rows_without = df[~mask]

    times_without = rows_without[survival_col]
    times_with = rows_with[survival_col]
    events_without = rows_without[censor_col].astype(bool)
    events_with = rows_with[censor_col].astype(bool)

    # Rows without the condition are drawn first, with an empty legend label;
    # that first plot creates the axes when none were supplied.
    fitter.fit(times_without, events_without, label="")
    if not ax:
        ax = fitter.plot(show_censors=True, ci_show=False)
    else:
        fitter.plot(ax=ax, show_censors=True, ci_show=False)

    fitter.fit(times_with, events_with, label=(legend_label))
    fitter.plot(ax=ax, show_censors=True, ci_show=False)

    # Survival probability lives on [0, 1].
    ax.set_ylim(0, 1)

    size_without = "# no condition {}".format(len(times_without))
    size_with = "# with condition {}".format(len(times_with))
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title("%s | %s" % (size_without, size_with))
    else:
        print(size_without)
        print(size_with)

    if xlabel:
        ax.set_xlabel(xlabel)

    return logrank_test(times_without,
                        times_with,
                        event_observed_A=events_without,
                        event_observed_B=events_with)
print("[*] Remove #%d outliers" % (len(data_) - len(data)))
N = len(df)  # number of data points

from lifelines import KaplanMeierFitter
from lifelines import NelsonAalenFitter

# `data` is a sequence of (duration, event) pairs.
(T, E) = zip(*data)

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)
naf = NelsonAalenFitter()
naf.fit(T, event_observed=E)

# Left panel: cumulative hazard; right panel: survival function.
ax = pyplot.subplot(121)
naf.plot(ax=ax)
ax = pyplot.subplot(122)
kmf.plot(ax=ax)

# Call syntax (instead of the print statement) parses on Python 2 and 3.
print(naf.cumulative_hazard_)
naf.cumulative_hazard_.to_csv("naf.csv")
pyplot.show()

# Durations split by event flag, histogrammed over [config.GAMMA, 1].
data0 = [a for (a, b) in data if b == 0]
data1 = [a for (a, b) in data if b == 1]
his0, bin_edges0 = np.histogram(data0, bins=bins0, range=(config.GAMMA, 1))
his1, bin_edges1 = np.histogram(data1, bins=bins1, range=(config.GAMMA, 1))
his = np.append(his0, his1)
ps = []