def test_pairwise_allows_dataframes():
    """Smoke test: ``pairwise_logrank_test`` accepts pandas Series as inputs."""
    n_samples = 100
    # Draw order (durations, censoring flags, groups) is kept so seeded runs
    # consume the random stream exactly as before.
    durations = np.random.exponential(1, size=n_samples)
    observed = np.random.binomial(1, 0.6, size=n_samples)
    labels = np.random.binomial(2, 0.5, size=n_samples)
    df = pd.DataFrame({"T": durations, "C": observed, "group": labels})
    # No assertion: the test passes as long as the call does not raise.
    stats.pairwise_logrank_test(df["T"], df["group"], event_observed=df["C"])
def significance_results(self, dataframe, durations, group, event_observed=None):
    """
    Run pairwise log-rank tests between groups and return a styled p-value table.

    Parameters
    ----------
    dataframe: Pandas dataframe holding the duration, group and (optionally)
               event-observed columns.
    durations: Name of the column with the time to event for each data point.
               For example, if the event is defined as "when supply for a
               clothing line is over" and say Women's size Small Tshirt
               supplies last for 45 days, then duration is 45.
    group: Name of the column that represents strata of dataframe
           (such as clusters or customer segments).
    event_observed: Name of the column saying whether the event was observed
           (True) or not (False). When None, no event column is passed to
           the test.

    Any of the three names that is not a string label present in
    ``dataframe`` falls back to the column name this method historically
    hard-coded ('durations', 'segments', 'event_observed'), so existing
    callers keep working.

    Returns
    -------
    pandas Styler over a one-column ('p-value') table of pairwise p-values,
    with ``color_negative_red`` applied to every cell.
    """
    def _column(label, legacy_name):
        # Prefer the caller-supplied column; otherwise keep the old behavior.
        if isinstance(label, str) and label in dataframe.columns:
            return dataframe[label]
        return dataframe[legacy_name]

    test_args = [_column(durations, 'durations'), _column(group, 'segments')]
    if event_observed is not None:
        test_args.append(_column(event_observed, 'event_observed'))

    df_pairwise = pairwise_logrank_test(*test_args).summary[['p']]
    df_pairwise.columns = ['p-value']
    return df_pairwise.style.applymap(color_negative_red)
def do_KM_analysis(durations, groups, events, group_labels, xlabel=None):
    """Plot one Kaplan-Meier curve per group and report log-rank statistics.

    ``durations``, ``groups`` and ``events`` are indexed with the boolean mask
    ``groups == value``, so they must support that (e.g. arrays or Series).
    ``group_labels[i]`` labels the i-th group in sorted order of the distinct
    values in ``groups``. The multivariate log-rank p-value is written onto
    the plot; with more than two groups a pairwise summary is printed too.

    Returns the fitter of the last group that was fitted.
    """
    fitters = []
    ax_list = []
    sns.set(
        palette="colorblind",
        font_scale=1.35,
        rc={"figure.figsize": (8, 6), "axes.facecolor": ".92"},
    )

    for i, cl in enumerate(sorted(set(groups))):
        in_group = groups == cl
        kmf = KaplanMeierFitter()
        kmf.fit(durations[in_group], events[in_group], label=group_labels[i])
        fitters.append(kmf)

        if i == 0:
            # The first curve creates the axes that all later curves reuse.
            ax_list.append(kmf.plot(ci_show=False))
            continue
        drawn_on = kmf.plot(ax=ax_list[-1], ci_show=False)
        if i != len(group_labels) - 1:
            ax_list.append(drawn_on)

    add_at_risk_counts(*fitters, labels=group_labels)

    final_ax = ax_list[-1]
    final_ax.set_ylim(0, 1.1)
    if xlabel is not None:
        final_ax.set_xlabel(xlabel)

    multi = multivariate_logrank_test(durations, groups, events)
    final_ax.text(0.1, 0.01, 'P-value=%.3f' % multi.p_value)

    if len(set(groups)) > 2:
        pairwise_logrank_test(durations, groups, events).print_summary()

    plt.show()
    return kmf
def test_pairwise_logrank_test_with_identical_data_returns_inconclusive():
    """Three groups sharing the same durations must not test as different."""
    base = np.random.exponential(10, size=100)
    durations = np.tile(base, 3)
    labels = np.array([1, 2, 3]).repeat(100)

    def _significance(cell):
        # Cells without a test result (None) stay None.
        if cell is None:
            return None
        return cell.is_significant

    outcome = stats.pairwise_logrank_test(durations, labels, alpha=0.99)
    observed = outcome.applymap(_significance)

    expected = np.array([
        [None, False, False],
        [False, None, False],
        [False, False, None],
    ])
    npt.assert_array_equal(observed.values, expected)
def test_pairwise_allows_dataframes_and_gives_correct_counts():
    """The summary must contain one row per unordered pair of groups."""
    n_samples = 100
    n_groups = 5
    # Random draws keep their original order: durations first, then flags.
    durations = np.random.exponential(1, size=n_samples)
    observed = np.random.binomial(1, 0.6, size=n_samples)
    # Labels 0..n_groups-1 repeated 20 times -> 100 rows.
    labels = np.tile(np.arange(n_groups), 20)
    df = pd.DataFrame({"T": durations, "C": observed, "group": labels})

    result = stats.pairwise_logrank_test(
        df["T"], df["group"], event_observed=df["C"]
    )

    expected_pairs = n_groups * (n_groups - 1) / 2
    assert result.summary.shape[0] == expected_pairs
def comparisons(df, print=True):
    """Run pairwise log-rank tests on ``df`` grouped by ``domain_system``.

    Reads the ``time``, ``spotted`` and ``domain_system`` attributes of
    ``df``. When ``print`` is truthy the result's summary is printed.
    Returns the test result object.

    NOTE: the ``print`` parameter shadows the builtin inside this function;
    it is kept because it is part of the public signature.
    """
    durations = df.time
    observed = df.spotted
    domains = df.domain_system
    res = pairwise_logrank_test(
        event_durations=durations,
        event_observed=observed,
        groups=domains,
    )
    if not print:
        return res
    res.print_summary()
    return res
def kaplanmeier_stats_multi(dataframe, grouping, analysis_type):
    """Write pairwise log-rank results to ``Kaplan_<analysis_type>.txt``.

    Parameters
    ----------
    dataframe : DataFrame with ``'survival'`` (durations) and ``'event'``
        columns plus the column named by ``grouping``.
    grouping : name of the column holding the group labels.
    analysis_type : text inserted into the output file name.

    The output is tab-separated with a header row and one line per
    non-empty cell of the pairwise result (test name, p-value, significance
    flag). Nothing is returned.
    """
    results = pairwise_logrank_test(
        event_durations=dataframe['survival'],
        groups=dataframe[grouping],
        event_observed=dataframe['event'],
        alpha=0.95,
        t_0=-1,
        # Was misspelled `bonferri`, so the option never reached the
        # parameter it was meant for.
        bonferroni=True,
    )
    with open('Kaplan_%s.txt' % (analysis_type), 'w') as f:
        f.write('Test\tP-Value\tSignificant\n')
        # `results` is iterated column by column; cells are either a test
        # result object or None.
        for column in results:
            for cell in results[column]:
                if cell is not None:
                    f.write('%s\t%4g\t%s\n' % (cell.test_name,
                                               cell.p_value,
                                               cell.is_significant))
def aggragate_clusters_on_pvalue(clusters, survival, status):
    """Repeatedly merge the two clusters whose survival differs least.

    On every round the pair of labels with the largest pairwise log-rank
    p-value is merged (the second label is rewritten to the first), until a
    single cluster is left. After each merge a multivariate log-rank test is
    run on the new labelling.

    Returns a ``zip`` of (labelling, number of clusters, multivariate
    p-value), one entry per merge.
    """
    current = clusters.copy()
    p_values = []
    many = []
    layers = []
    while len(np.unique(current)) > 1:
        # NOTE(review): `numbers_at_risk` is not defined in this function and
        # is not used again here; presumably a module-level list - confirm,
        # otherwise this line raises NameError.
        numbers_at_risk.append([])
        pairwise = pairwise_logrank_test(survival, current, status)

        # Upper triangle only: each unordered pair of labels once.
        size = pairwise.shape[0]
        pairs = {
            (pairwise.index[i], pairwise.columns[j]): pairwise.iloc[i, j].p_value
            for i in range(size)
            for j in range(i + 1, size)
        }
        ranked = sorted(pairs.items(), key=operator.itemgetter(1))
        (kept_label, absorbed_label), _ = ranked[-1]

        merged = current.copy()
        merged[np.where(current == absorbed_label)[0]] = kept_label
        layers.append(merged)
        current = merged.copy()

        res = multivariate_logrank_test(survival, current, status)
        many.append(len(np.unique(current)))
        p_values.append(res.p_value)
    return zip(layers, many, p_values)
plt.xticks(np.arange(0, 30, step=5))
plt.title("Time spent on Billboard Hot 100 Chart before first exit")
plt.savefig('Billboard Hot 100 - Kaplan-Meier Plot by Year 2014 to 2017.png')
plt.show()

# Columns shared by every test below.
_weeks = recent_years['Total Weeks in First Appearance']
_observed = recent_years['Event Observed']

# Test whether 2017 and 2016 differ to a statistically significant level.
results_2017_2016 = logrank_test(
    _weeks[year_mask_2017],
    _weeks[year_mask_2016],
    event_observed_A=_observed[year_mask_2017],
    event_observed_B=_observed[year_mask_2016],
)
# results_2017_2016.print_summary()
# Author's note: p-value of .2495, so not statistically significant.

# Test whether 2016 and 2015 differ to a statistically significant level.
results_2016_2015 = logrank_test(
    _weeks[year_mask_2016],
    _weeks[year_mask_2015],
    event_observed_A=_observed[year_mask_2016],
    event_observed_B=_observed[year_mask_2015],
)
# results_2016_2015.print_summary()
# Author's note: p-value of .5160, so not statistically significant.

results_pw = pairwise_logrank_test(_weeks, recent_years['Entry Year'], _observed)
# Print the upper triangle of the 4x4 pairwise result:
# (0,1), (0,2), (0,3), (1,2), (1,3), (2,3).
for _i in range(4):
    for _j in range(_i + 1, 4):
        print(results_pw.iloc[_i, _j])

# EXAMINE DATA BY DECADE
plt.figure(3)
ax = plt.subplot(111)
year_mask_201 = (by_song['Entry Decade'] == 201)
kmf_201 = KaplanMeierFitter()
def test_pairwise_waltons_dataset_is_significantly_different():
    """The two groups of the Waltons dataset must test as different."""
    data = load_waltons()
    durations, labels = data['T'], data['group']
    pairwise = stats.pairwise_logrank_test(durations, labels)
    # Cell (0, 1) holds the comparison between the first and second group.
    first_vs_second = pairwise.values[0, 1]
    assert first_vs_second.is_significant
def do_cum_incidence(gtdf, label):
    """Plot cumulative incidence per laser pulse and compare head vs thorax.

    Parameters
    ----------
    gtdf : DataFrame with columns ``obj_id``, ``t``, ``laser_state``, ``ttm``
        and ``zx``. It is modified in place: the columns
        ``cum_wei_this_trial``, ``head_trial`` and ``thorax_trial`` are
        (re)created. Rows of one ``obj_id`` must have strictly increasing
        ``t`` (asserted).
    label : text used in the figure name and the HTML heading.

    Returns
    -------
    dict with keys ``fig``, ``ax``, ``n_head``, ``n_thorax``, ``p_value`` and
    ``buf`` (an HTML fragment). ``p_value`` is NaN when the log-rank test
    failed with a ``LinAlgError``; previously that path crashed on an
    unbound variable.
    """
    # Calculate cumulative wing extension since last laser on.
    # (This is ugly...)
    gtdf['cum_wei_this_trial'] = 0
    gtdf['head_trial'] = 0
    gtdf['thorax_trial'] = 0

    obj_ids = gtdf['obj_id'].unique()
    for obj_id in obj_ids:
        prev_laser_state = 0
        cur_cum_this_trial = 0
        cur_head_trial = 0
        cur_thorax_trial = 0
        cur_state = 'none'
        prev_t = -np.inf
        for rowi in range(len(gtdf)):
            row = gtdf.iloc[rowi]
            if row['obj_id'] != obj_id:
                continue
            assert row['t'] > prev_t
            prev_t = row['t']
            if prev_laser_state == 0 and row['laser_state']:
                # new laser pulse, reset cum
                cur_cum_this_trial = 0
                # The sign of ttm decides which trial counter advances.
                if row['ttm'] < 0:
                    cur_head_trial += 1
                    cur_state = 'head'
                elif row['ttm'] > 0:
                    cur_thorax_trial += 1
                    cur_state = 'thorax'
            prev_laser_state = row['laser_state']
            if row['zx'] > 0:
                # Latches at 1 until the next pulse resets it.
                cur_cum_this_trial = 1
            row['cum_wei_this_trial'] = cur_cum_this_trial
            if cur_state == 'head':
                row['head_trial'] = cur_head_trial
            elif cur_state == 'thorax':
                row['thorax_trial'] = cur_thorax_trial
            # `row` is a detached Series; write the edits back.
            gtdf.iloc[rowi] = row

    fig_cum = plt.figure('cum indx: %s' % label, figsize=(6, 5))
    ax_cum = fig_cum.add_subplot(111)

    pulse_nums = [1, 2, 3]
    head_data = []
    thorax_data = []
    # Accumulators threaded through _update_latencies, later passed to the
    # log-rank test as durations, group labels and event flags.
    sdx_e = []
    sdx_g = []
    sdx_c = []
    for pulse_num in pulse_nums:
        head_pulse_df = gtdf[gtdf['head_trial'] == pulse_num]
        thorax_pulse_df = gtdf[gtdf['thorax_trial'] == pulse_num]
        h = _do_group(head_pulse_df)
        t = _do_group(thorax_pulse_df)
        sdx_e, sdx_g, sdx_c = _update_latencies(
            head_pulse_df, sdx_e, sdx_g, sdx_c, 'head')
        sdx_e, sdx_g, sdx_c = _update_latencies(
            thorax_pulse_df, sdx_e, sdx_g, sdx_c, 'thorax')
        head_data.append(h)
        thorax_data.append(t)
        # Thin lines: one curve per pulse number.
        plot_cum(ax_cum, h, lw=0.5, color=COLORS[HEAD])
        plot_cum(ax_cum, t, lw=0.5, color=COLORS[THORAX])

    all_head_data = combine_cum_data(head_data)
    all_thorax_data = combine_cum_data(thorax_data)
    # Thick lines: all pulses combined.
    plot_cum(ax_cum, all_head_data, label='head', lw=2, color=COLORS[HEAD])
    plot_cum(ax_cum, all_thorax_data, label='thorax', lw=2,
             color=COLORS[THORAX])
    ax_cum.legend()

    sdx_e = np.array(sdx_e)
    sdx_g = np.array(sdx_g)
    sdx_c = np.array(sdx_c)

    buf = ''
    alpha = 0.95
    p_value = np.nan  # stays NaN if the test cannot be computed
    try:
        _, P, T = pairwise_logrank_test(sdx_e, sdx_g, sdx_c, alpha=alpha)
    except np.linalg.LinAlgError:
        # Public alias of the class formerly reached through the private
        # np.linalg.linalg module path.
        buf += 'numerical errors computing logrank test'
    else:
        buf += '<h1>%s</h1>\n' % label
        buf += '<h2>pairwise logrank test</h2>\n'
        buf += ' analyses done using the <a href="http://lifelines.readthedocs.org">lifelines</a> library\n'
        buf += P._repr_html_()
        buf += '<h3>significant at alpha=%s?</h3>\n' % alpha
        buf += T._repr_html_()
        p_value = P['head']['thorax']

    n_head = sum(1 for x in sdx_g if x == 'head')
    n_thorax = sum(1 for x in sdx_g if x == 'thorax')

    return {
        'fig': fig_cum,
        'ax': ax_cum,
        'n_head': n_head,
        'n_thorax': n_thorax,
        'p_value': p_value,
        'buf': buf,
    }
#kmf.fit(latency_arr, censorship=C, label=name_key ) for i in range(len(C)): sdx_e.append(latency_arr[i]) sdx_g.append(name_key) sdx_c.append(C[i]) # --- stats -------------------------------- sdx_e = np.array(sdx_e) sdx_g = np.array(sdx_g) sdx_c = np.array(sdx_c) alpha = 0.95 try: S, P, T = pairwise_logrank_test(sdx_e, sdx_g, sdx_c, alpha=alpha) except np.linalg.linalg.LinAlgError: buf += 'numerical errors computing logrank test' else: buf += '<h2>pairwise logrank test</h2>\n' buf += ' analyses done using the <a href="http://lifelines.readthedocs.org">lifelines</a> library\n' buf += P._repr_html_() buf += '<h3>significant at alpha=%s?</h3>\n' % alpha buf += T._repr_html_() ''' clipped = df_all.copy() clipped['latency'] = clipped['latency'].clip(upper=MAX_LATENCY) group_info = label_homogeneous_groups_pandas( clipped, groupby_column_name='name_key', value_column_name='latency')
def test_pairwise_waltons_dataset_is_significantly_different():
    """The first two Waltons groups must differ at the 0.05 level."""
    data = load_waltons()
    durations, labels = data["T"], data["group"]
    pairwise = stats.pairwise_logrank_test(durations, labels)
    # Cell (0, 1) holds the comparison between the first and second group.
    first_vs_second = pairwise.values[0, 1]
    assert first_vs_second.p_value < 0.05
kmf.fit(T[ix], E[ix], label=each) if i == 0: ax = kmf.plot(ci_show=False) else: ax = kmf.plot(ax=ax, ci_show=False) ax.set_title(r"Survival Curves for Different Subjectwise") # Log rank Test for differences in survival curves from lifelines import statistics df_survival_test = pandas.DataFrame({ 'durations': T, 'events': E, 'groups': groups }) result = statistics.pairwise_logrank_test(df_survival_test['durations'], df_survival_test['groups'], df_survival_test['events']) result.test_statistic result.p_value result.print_summary() # Given that math and science are significantly different, we need to see if there is any difference in # their features of teachers of math and science from EL survey data. # Codes: # teacher_gender: male - 1, female - 2 # edu_qual: Phd, Mphil, DoublePG, PG, Grad, Sec, HrSec # qual_bechalore: Indian_languages, eng, math, sci, soc_stud, other, NA # qual_master: Indian_languages, eng, math, sci, soc_stud, other, NA # teacher_profqual: MEd, BEd, DEd, other, no_prof_qual # teacher_sub: indian_languages, eng, math, sci, soc_stud, other, NA
def test_pairwise_waltons_dataset_is_significantly_different():
    """The first two Waltons groups must differ at the 0.05 level."""
    data = load_waltons()
    pairwise = stats.pairwise_logrank_test(data['T'], data['group'])
    # Cell (0, 1) holds the comparison between the first and second group.
    comparison = pairwise.values[0, 1]
    threshold = 0.05
    assert comparison.p_value < threshold
def KM_all_vars(features, outcomes):
    """Plot Kaplan-Meier curves per feature and run pairwise log-rank tests.

    Parameters
    ----------
    features : DataFrame, one column per feature. Each column is converted
        to float IN PLACE while it is processed.
    outcomes : mapping/DataFrame with ``'duration_mortality'`` and
        ``'event_mortality'`` entries aligned with ``features``.

    For each feature a figure is saved to ``km_curves/KM_<feature>.png`` and
    shown. Binary features are split into yes / no / unknown, float features
    into ``ngroups`` quantile bins plus unknown. One-hot families (corads,
    gender) and unrecognised features get no test yet.

    Returns
    -------
    list with, per feature, either the pairwise log-rank result or ``None``
    when no test was run for that feature.

    Fixes over the previous version: the "significant" message now prints
    both the raw and the corrected p-value; fitters for empty groups are
    skipped instead of referencing unbound or stale variables; inner loops no
    longer overwrite the outer loop variable; unrecognised features are
    skipped instead of reusing the previous feature's groups; the invalid
    ``figsize`` argument to ``savefig`` is gone; figures are closed.
    """
    T = outcomes['duration_mortality']
    E = outcomes['event_mortality']

    # A feature is binary when its name contains one of these markers.
    binary_markers = (
        'ja',
        'positief',
        'Andere infectieuze ademhalingsdiagnose',
        'oxygen_saturation_on_cat_',
    )
    is_binary = {
        col for col in features.columns
        if any(marker in col for marker in binary_markers)
    }
    is_float = [
        'ALAT (U/L)', 'ASAT (U/L)', 'Ureum (mmol/L)', 'Albumine (g/L)',
        'Ca (totaal) (mmol/L)', 'Creatinine (µmol/L)', 'glucose (mmol/L)',
        'Hb (mmol/L)', 'hart frequentie (1/min)', 'LDH (U/L)',
        'Lactaat (mmol/L)', 'Lymfocyten (x10^9/L)', 'Neutrofielen (x10^9/L)',
        'pCO2 (kPa)', 'Zuurgraad (pH)', 'PaO2 arteriëel (kPa)',
        'trombocyten (x10^9/L)', 'Kalium (mmol/L)', 'SaO2 (%)',
        'Natrium (mmol/L)', 'Temperatuur (ºC)', 'totaal Bilirubine (IE)',
        'Leukocyten (x10^3/µL)', 'CK (U/L)', 'CRP (mg/L)',
        'Tijd sinds eerste klachten (dagen)', 'Diastolische bloeddruk (mmHg)',
        'ferritine (mg/L)', 'FiO2 (%)', 'Zuurstof saturatie (%)',
        'Ademhalingsfrequentie (1/min)', 'Systolische bloeddruk (mmHg)',
        'Aantal thuismedicamenten'
    ]
    is_categorical_corads = [
        'corads_admission_cat_1', 'corads_admission_cat_2',
        'corads_admission_cat_3', 'corads_admission_cat_4',
        'corads_admission_cat_5'
    ]
    is_categorical_male_female = ['gender_cat_1', 'gender_cat_2']

    ngroups = 3
    # NOTE(review): 289 multiplies the p-value before comparing with 0.05;
    # presumably the number of comparisons (Bonferroni-style) - confirm.
    n_comparisons = 289

    def _fit(mask, name):
        # Fit a Kaplan-Meier curve on the rows selected by `mask`.
        return KaplanMeierFitter().fit(
            T[mask], E[mask],
            label='{} (n={})'.format(name, np.nansum(mask)))

    statres = []
    for f in features.columns:
        print(f)
        kmfs = None
        fig, axes = plt.subplots(1, 1)
        features[f] = features[f].astype(float)

        if f in is_binary:
            kmfs = []
            for mask, name in ((features[f] == 1, 'Ja'),
                               (features[f] == 0, 'Nee'),
                               (features[f].isna(), 'Onbekend')):
                # Only fit groups that actually have rows.
                if sum(mask) > 0:
                    kmfs.append(_fit(mask, name))
            groups = features[f].astype("category")
        elif f in is_float:
            ff = pd.qcut(features[f], ngroups, labels=np.arange(ngroups) + 1)
            kmfs = []
            for q in ff.unique():
                sel = ff == q
                if sum(sel) > 0:
                    kmfs.append(_fit(sel, q))
            sel = ff.isna()
            if sum(sel) > 0:
                kmfs.append(_fit(sel, 'Onbekend'))
            groups = ff
        elif f in is_categorical_corads or f in is_categorical_male_female:
            # FIXME (carried over): one-hot families are fitted but neither
            # tested nor plotted yet; a None placeholder is recorded.
            family = (is_categorical_corads if f in is_categorical_corads
                      else is_categorical_male_female)
            kmfs = []
            for col in family:
                # Explicit comparison so the mask is boolean whatever the
                # column dtype is (the corads branch used the raw column).
                sel = features[col] == 1.0
                if sum(sel) > 0:
                    kmfs.append(_fit(sel, col))
            na = ~features[family].any(axis=1)
            if sum(na) > 0:
                kmfs.append(_fit(na, 'Onbekend'))
            statres += [None]
            plt.close(fig)
            continue
        else:
            print('{} not implemented'.format(f))
            # No groups exist for this feature: record a placeholder rather
            # than testing with the previous feature's groups.
            statres += [None]
            plt.close(fig)
            continue

        # treat nan as separate group.
        groups = groups.cat.add_categories(-1).fillna(-1)
        if len(np.unique(groups)) < 5:
            ss = [pairwise_logrank_test(T, groups, E, t_0=21.)]
            p = np.min([s.p_value for s in ss])
            print(p)
            if p * n_comparisons < 0.05:
                print('{} < 0.05 (p: {}, corrected p: {})'.format(
                    f, p, p * n_comparisons))
            statres += ss
        else:
            print('error in groups: {} - {}'.format(f, groups))

        if kmfs:
            for k in kmfs:
                k.plot_survival_function()

        axes.set_xticks([1, 5, 9, 13, 17, 21])
        axes.set_xticklabels(['1', '5', '9', '13', '17', '21'])
        axes.set_xlabel('Aantal dagen sinds opnamedag')
        axes.set_ylabel('Proportie overlevend')
        axes.set_xlim(0, 21)
        axes.set_ylim(0, 1)
        plt.title('Kaplan-Meier survival - {}'.format(f))
        plt.tight_layout()

        filename = clean_filename('KM_{}.png'.format(f))
        os.makedirs('km_curves', exist_ok=True)
        # Save before showing so the file is written even if the window is
        # closed by the user.
        fig.savefig(os.path.join('km_curves', filename),
                    format='png',
                    dpi=300,
                    pad_inches=0,
                    bbox_inches='tight')
        plt.show()
        plt.close(fig)
    return statres
def generate_pdf(bioassay_name_pattern):
    """Build the bioassay report with pdflatex and stream ``report.pdf``.

    ``bioassay_name_pattern`` is used as a SQL LIKE pattern on
    ``btl.bioassay_name``; with the ``%`` characters removed it becomes the
    report title. The function writes ``survivorshipfig.pdf`` and
    ``report.tex`` to the working directory and runs ``pdflatex`` twice.
    """
    # Mod by Aubrey Moore 2019-04-17.
    # The query matches the full bioassay name pattern. Previously
    # (LIKE '%{}%') a search for 'PNG' matched
    # [PNG-1, PNG-2, PNG-3, PNGperOS-1, PNGperOS-2, PNGperOS-3];
    # now it matches only [PNG-1, PNG-2, PNG-3].
    bioassay_name = bioassay_name_pattern.replace('%', '')
    # NOTE(review): both queries in this function are built with str.format;
    # if the pattern can come from a request this is injectable - consider
    # executesql placeholders once the driver's parameter style is confirmed.
    btl_query = "SELECT * FROM btl WHERE bioassay_name LIKE '{}'".format(
        bioassay_name_pattern)
    df = pd.DataFrame(db.executesql(btl_query, as_dict=True))
    print(df.info())

    t, e = lifelines.utils.datetimes_to_durations(
        start_times=pd.to_datetime(df.date_start_bioassay),
        end_times=pd.to_datetime(df.date_died),
        fill_date=pd.to_datetime(df.date_end_bioassay))
    print(t)
    df['t'] = t
    df['e'] = e

    # Create survivorship plot: one curve per treatment.
    fig, ax = plt.subplots(figsize=(18, 6))
    kmf = KaplanMeierFitter()
    for name, grouped_df in df.groupby('bioassay_treatment'):
        kmf.fit(grouped_df['t'], grouped_df['e'], label=name)
        kmf.plot(ax=ax, linewidth=5, ci_show=False)
    ax.set_xlabel('days after treatment')
    ax.set_ylabel('proportion dead')
    ax.set_ylim([0, 1])
    fig.savefig('survivorshipfig.pdf')

    # Create mortality table as LaTeX.
    results = statistics.pairwise_logrank_test(df['t'],
                                               df['bioassay_treatment'],
                                               df['e'])
    mortality_table_tex = r'''
\begin{table}[h!]
\centering
\caption{Pairwise differences among mortality curves.}
'''
    mortality_table_tex += results.summary.to_latex()
    mortality_table_tex += r'\end{table}'
    print(mortality_table_tex)

    # Create document
    s = r'''
\documentclass[11pt]{scrartcl}
\usepackage{textcomp}
\usepackage{gensymb}
\usepackage{graphicx}
\usepackage{grffile} %required because there are multiple dot characters in my file names
\usepackage{booktabs}
\usepackage[letterpaper, margin=1in]{geometry}
\titlehead{\centering\includegraphics[width=0.75in]{applications/rearing/static/images/crb_logo.png}\\
University of Guam Coconut Rhinoceros Beetle Biological Control Project\\
Bioassay Report generated by CRB Rearing Database v.20190317\\
https://aubreymoore.pythonanywhere.com/rearing}
\title{---title---}
\author{Aubrey Moore and Jim Grasela\\University of Guam Coconut Rhinoceros Beetle Biocontrol Project}
\begin{document}
\begin{titlepage}
\maketitle
\tableofcontents
\end{titlepage}
---description---
\clearpage
\section{Mortality}
\includegraphics[width=\textwidth]{survivorshipfig.pdf}
---mortality-table-tex---
\clearpage
\section{Mass}
---mass plots---
\clearpage
\section{Postmortem Images}
---postmortem images---
\end{document}
'''
    s = s.replace('---title---', tex_escape(bioassay_name))
    s = s.replace('---mortality-table-tex---', mortality_table_tex)

    # Replace ---description--- with tex from database (empty if none).
    sql = 'select tex from bioassay where name="{}"'.format(
        bioassay_name_pattern)
    logger.debug(sql)
    rows = db.executesql(sql)
    description = ''
    if rows:
        logger.debug(rows[0])
        description = rows[0][0].encode('utf8')
        logger.debug('type: {} description: {}'.format(
            type(description), description))
    s = s.replace('---description---', description)

    # Replace ---mass plots--- with one subsection + figure per file.
    file_list = plot_mass_by_treatment(bioassay_name_pattern)
    logger.debug('file_list: {}'.format(file_list))
    tex = ''.join(
        r'\subsection*{' + file_name.replace('-', ' ') + r'}' + '\n' +
        r'\includegraphics[width=\textwidth]{' + file_name + r'}' + '\n'
        for file_name in file_list)
    s = s.replace('---mass plots---', tex)

    # NOTE(review): this branch is disabled, so the literal
    # '---postmortem images---' placeholder stays in the generated document.
    if False:
        # Replace ---postmortem images---
        tex = get_postmortem_images(bioassay_name_pattern)
        s = s.replace('---postmortem images---', tex)

    logger.debug(s)

    # Generate PDF
    with open('report.tex', "w") as f:
        f.write(s)
    # pdflatex is invoked twice in a row; presumably so the table of
    # contents resolves on the second pass.
    for _ in range(2):
        subprocess.call(['pdflatex', 'report.tex'])
    return response.stream('report.pdf')
plt.rcParams['figure.figsize'] = [12, 5] kmf.plot(); print(dfCurvas.sexo.value_counts()) curvaSobrevivencia(dfCurvas,'sexo') results=multivariate_logrank_test(event_durations=T,groups=dfCurvas.sexo,event_observed=C) results.print_summary() results=pairwise_logrank_test(event_durations=T,groups=dfCurvas.sexo,event_observed=C) results.print_summary() dfCurvas.mesesUP.describe() var='mesesUP' varEscalao='escMesesUP' dfCurvas[varEscalao]='' for index, cliente in dfCurvas.iterrows(): #se a variável tiver o valor 1 colocar na nova variável a descrição da atividade if cliente[var] <= 2: dfCurvas.at[index,varEscalao]=var+' less than 2' elif (cliente[var] > 2) & (cliente[var] <= 4): dfCurvas.at[index,varEscalao]=var+' greather than 2 and less 4'
def test_pairwise_waltons_dataset_is_significantly_different():
    """'control' vs 'miR-137' in the Waltons data must differ at 0.05."""
    data = load_waltons()
    result = stats.pairwise_logrank_test(data["T"], data["group"])
    # The summary is indexed by the pair of group labels.
    pair_row = result.summary.loc[("control", "miR-137")]
    p_value = pair_row["p"]
    assert p_value < 0.05
def do_cum_incidence(gtdf, label):
    """Plot cumulative incidence per laser pulse and compare head vs thorax.

    Parameters
    ----------
    gtdf : DataFrame with columns ``obj_id``, ``t``, ``laser_state``, ``ttm``
        and ``zx``. It is modified in place: the columns
        ``cum_wei_this_trial``, ``head_trial`` and ``thorax_trial`` are
        (re)created. Rows of one ``obj_id`` must have strictly increasing
        ``t`` (asserted).
    label : text used in the figure name and the HTML heading.

    Returns
    -------
    dict with keys ``fig``, ``ax``, ``n_head``, ``n_thorax``, ``p_value`` and
    ``buf`` (an HTML fragment). ``p_value`` is NaN when the log-rank test
    failed with a ``LinAlgError``; previously that path crashed on an
    unbound variable.
    """
    # Calculate cumulative wing extension since last laser on.
    # (This is ugly...)
    gtdf['cum_wei_this_trial'] = 0
    gtdf['head_trial'] = 0
    gtdf['thorax_trial'] = 0

    obj_ids = gtdf['obj_id'].unique()
    for obj_id in obj_ids:
        prev_laser_state = 0
        cur_cum_this_trial = 0
        cur_head_trial = 0
        cur_thorax_trial = 0
        cur_state = 'none'
        prev_t = -np.inf
        for rowi in range(len(gtdf)):
            row = gtdf.iloc[rowi]
            if row['obj_id'] != obj_id:
                continue
            assert row['t'] > prev_t
            prev_t = row['t']
            if prev_laser_state == 0 and row['laser_state']:
                # new laser pulse, reset cum
                cur_cum_this_trial = 0
                # The sign of ttm decides which trial counter advances.
                if row['ttm'] < 0:
                    cur_head_trial += 1
                    cur_state = 'head'
                elif row['ttm'] > 0:
                    cur_thorax_trial += 1
                    cur_state = 'thorax'
            prev_laser_state = row['laser_state']
            if row['zx'] > 0:
                # Latches at 1 until the next pulse resets it.
                cur_cum_this_trial = 1
            row['cum_wei_this_trial'] = cur_cum_this_trial
            if cur_state == 'head':
                row['head_trial'] = cur_head_trial
            elif cur_state == 'thorax':
                row['thorax_trial'] = cur_thorax_trial
            # `row` is a detached Series; write the edits back.
            gtdf.iloc[rowi] = row

    fig_cum = plt.figure('cum indx: %s' % label, figsize=(6, 5))
    ax_cum = fig_cum.add_subplot(111)

    pulse_nums = [1, 2, 3]
    head_data = []
    thorax_data = []
    # Accumulators threaded through _update_latencies, later passed to the
    # log-rank test as durations, group labels and event flags.
    sdx_e = []
    sdx_g = []
    sdx_c = []
    for pulse_num in pulse_nums:
        head_pulse_df = gtdf[gtdf['head_trial'] == pulse_num]
        thorax_pulse_df = gtdf[gtdf['thorax_trial'] == pulse_num]
        h = _do_group(head_pulse_df)
        t = _do_group(thorax_pulse_df)
        sdx_e, sdx_g, sdx_c = _update_latencies(
            head_pulse_df, sdx_e, sdx_g, sdx_c, 'head')
        sdx_e, sdx_g, sdx_c = _update_latencies(
            thorax_pulse_df, sdx_e, sdx_g, sdx_c, 'thorax')
        head_data.append(h)
        thorax_data.append(t)
        # Thin lines: one curve per pulse number.
        plot_cum(ax_cum, h, lw=0.5, color=COLORS[HEAD])
        plot_cum(ax_cum, t, lw=0.5, color=COLORS[THORAX])

    all_head_data = combine_cum_data(head_data)
    all_thorax_data = combine_cum_data(thorax_data)
    # Thick lines: all pulses combined.
    plot_cum(ax_cum, all_head_data, label='head', lw=2, color=COLORS[HEAD])
    plot_cum(ax_cum, all_thorax_data, label='thorax', lw=2,
             color=COLORS[THORAX])
    ax_cum.legend()

    sdx_e = np.array(sdx_e)
    sdx_g = np.array(sdx_g)
    sdx_c = np.array(sdx_c)

    buf = ''
    alpha = 0.95
    p_value = np.nan  # stays NaN if the test cannot be computed
    try:
        _, P, T = pairwise_logrank_test(sdx_e, sdx_g, sdx_c, alpha=alpha)
    except np.linalg.LinAlgError:
        # Public alias of the class formerly reached through the private
        # np.linalg.linalg module path.
        buf += 'numerical errors computing logrank test'
    else:
        buf += '<h1>%s</h1>\n' % label
        buf += '<h2>pairwise logrank test</h2>\n'
        buf += ' analyses done using the <a href="http://lifelines.readthedocs.org">lifelines</a> library\n'
        buf += P._repr_html_()
        buf += '<h3>significant at alpha=%s?</h3>\n' % alpha
        buf += T._repr_html_()
        p_value = P['head']['thorax']

    n_head = sum(1 for x in sdx_g if x == 'head')
    n_thorax = sum(1 for x in sdx_g if x == 'thorax')

    return {
        'fig': fig_cum,
        'ax': ax_cum,
        'n_head': n_head,
        'n_thorax': n_thorax,
        'p_value': p_value,
        'buf': buf,
    }
def plot3(df):
    """Print pairwise log-rank results and plot Kaplan-Meier curves for ``df``.

    Columns read from ``df``: ``time``, ``group`` and ``event`` (log-rank
    test), plus ``death`` and ``censored`` (count columns converted with
    ``survival_events_from_table``). ``df`` itself is not modified.

    One curve is drawn for all rows together (with confidence band) and one
    per value of ``group``, all on the same axes. The result of
    ``fig_to_uri(plt)`` is returned.

    NOTE(review): the previous version could not run to completion - it
    called ``set_index('time')`` twice and looped over
    ``T, E, W.groupby('group')`` with an undefined ``grouped_survival``. The
    per-group plotting below is the reconstruction suggested by the original
    comment ("combine the grouping function and the events from table
    function"); confirm it matches the intended figure.
    """
    import matplotlib.pyplot as plt
    from lifelines import KaplanMeierFitter
    from lifelines.statistics import pairwise_logrank_test
    from lifelines.utils import survival_events_from_table

    survival = df
    results = pairwise_logrank_test(survival['time'], survival['group'],
                                    survival['event'])
    results.print_summary()

    # survival_events_from_table expects the time as index and converts the
    # "death" / "censored" count columns into the lifelines format.
    table = survival.set_index('time')
    ax = plt.subplot(111)

    # Curve over all rows.
    T, E, W = survival_events_from_table(table,
                                         observed_deaths_col='death',
                                         censored_col='censored')
    print(E)
    kmf = KaplanMeierFitter()
    kmf.fit(T, E, weights=W)
    kmf.plot(ax=ax, ci_show=True, marker='o')

    # One curve per group (treatment).
    for name, group_table in table.groupby('group'):
        T_group, E_group, W_group = survival_events_from_table(
            group_table,
            observed_deaths_col='death',
            censored_col='censored')
        if len(T_group) == 0:
            # Nothing recorded for this group; there is no curve to fit.
            continue
        kmf.fit(T_group, E_group, weights=W_group, label=name)
        kmf.plot(ax=ax, ci_show=False, marker='o')

    plt.xlabel("days")
    plt.ylabel("survival %")
    plt.ylim(0.4, 1.05)
    return fig_to_uri(plt)
C = latency_arr < MAX_LATENCY #kmf.fit(latency_arr, censorship=C, label=name_key ) for i in range(len(C)): sdx_e.append( latency_arr[i] ) sdx_g.append( name_key ) sdx_c.append( C[i] ) # --- stats -------------------------------- sdx_e = np.array(sdx_e) sdx_g = np.array(sdx_g) sdx_c = np.array(sdx_c) alpha = 0.95 try: S,P,T = pairwise_logrank_test( sdx_e, sdx_g, sdx_c, alpha=alpha ) except np.linalg.linalg.LinAlgError: buf += 'numerical errors computing logrank test' else: buf += '<h2>pairwise logrank test</h2>\n' buf += ' analyses done using the <a href="http://lifelines.readthedocs.org">lifelines</a> library\n' buf += P._repr_html_() buf += '<h3>significant at alpha=%s?</h3>\n'%alpha buf += T._repr_html_() ''' clipped = df_all.copy() clipped['latency'] = clipped['latency'].clip(upper=MAX_LATENCY) group_info = label_homogeneous_groups_pandas( clipped, groupby_column_name='name_key', value_column_name='latency')