def interaction_empirical_p_resample(a, b, surv, num_perm=101, check_first=True):
    '''
    Empirical p-value for the interaction of two events, built from
    labels drawn with replacement.
    ------------------------------------------------
    a, b:        Series of event labels (aligned with match_series)
    surv:        survival data; it is unstacked into a two column
                 ('days', 'event') table, so presumably a Series with a
                 two level index -- TODO confirm against callers
    num_perm:    number of resampled data sets to score
    check_first: when exactly True, return p = 1 right away if the
                 observed interaction statistic is negative

    Returns a Series with entries 'p' and 'interaction', where
    'interaction' is 'both' if the Fisher odds ratio of a and b is
    above 1 and 'neither' otherwise.
    '''
    a, b = match_series(a, b)
    odds_ratio = fisher_exact_test(a, b)['odds_ratio']
    int_direction = 'both' if odds_ratio > 1 else 'neither'

    # Observed statistic on the un-resampled data.
    r = get_interaction(a, b, surv)
    if (r < 0) and (check_first is True):
        return pd.Series({'p': 1, 'interaction': int_direction})

    # One row of patient labels (drawn with replacement) per resample.
    draws = np.random.choice(a.index, size=(num_perm, len(a.index)))
    null_stats = {}
    for k, drawn in enumerate(draws):
        pos = range(len(drawn))
        a_p = pd.Series(list(a.ix[drawn]), pos)
        b_p = pd.Series(list(b.ix[drawn]), pos)
        # NOTE(review): survival rows are taken in the original a.index
        # order, not in the resampled order -- confirm this is the
        # intended null model.
        surv_values = surv.unstack().ix[a.index].as_matrix()
        surv_p = pd.DataFrame(surv_values, index=pos,
                              columns=['days', 'event']).stack()
        null_stats[k] = get_interaction(a_p, b_p, surv_p, int_direction)
    null_stats = pd.Series(null_stats)

    # Fraction of resampled statistics strictly above the observed one.
    n_total = len(null_stats)
    n_not_above = sum(null_stats <= r)
    empirical_p = 1. * (n_total - n_not_above) / n_total
    return pd.Series({'p': empirical_p, 'interaction': int_direction})
def box_plot_pandas(bin_vec, real_vec, ax=None):
    """
    Draw one box per label of bin_vec using matplotlib's boxplot.

    Inputs
        bin_vec:  Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ax:       axes handed to init_ax (None by default)

    Boxes are ordered as returned by bin_vec.value_counts().  Fliers, caps
    and whiskers are hidden, and a box holding fewer than 3 measurements is
    made fully transparent.
    """
    _, ax = init_ax(ax)
    bin_vec, real_vec = match_series(bin_vec, real_vec)

    categories = bin_vec.value_counts().index
    data = [real_vec[bin_vec == label] for label in categories]
    bp = ax.boxplot(data, positions=range(len(categories)), widths=.3,
                    patch_artist=True)

    # Only label an axis when the corresponding Series carries a name.
    if real_vec.name:
        ax.set_ylabel(real_vec.name)
    if bin_vec.name:
        ax.set_xlabel(bin_vec.name)

    # Strip everything except the boxes and their medians.
    for part in ('fliers', 'caps', 'whiskers'):
        for artist in bp[part]:
            artist.set_visible(False)

    for median in bp['medians']:
        median.set_color(colors[0])
        median.set_lw(3)
        median.set_alpha(.8)

    for box, group in zip(bp['boxes'], data):
        box.set_color('grey')
        box.set_lw(3)
        box.set_alpha(.7)
        if len(group) < 3:
            # Too few points for a meaningful box: hide it.
            box.set_alpha(0)
def interaction_empirical_p(a, b, surv, num_perm=101):
    '''
    Empirical p-value for the interaction of two events, built from
    random permutations of the patient labels.
    ------------------------------------------------
    a, b:     Series of event labels (aligned with match_series)
    surv:     survival data; it is unstacked into a two column
              ('days', 'event') table, so presumably a Series with a
              two level index -- TODO confirm against callers
    num_perm: number of permuted data sets to score

    Returns a Series with entries 'p' and 'interaction', where
    'interaction' is 'both' if the Fisher odds ratio of a and b is
    above 1 and 'neither' otherwise.
    '''
    a, b = match_series(a, b)
    odds_ratio = fisher_exact_test(a, b)['odds_ratio']
    int_direction = 'both' if odds_ratio > 1 else 'neither'

    # Observed statistic on the un-permuted data.
    r = get_interaction(a, b, surv)

    # One shuffled copy of the patient labels per permutation.
    shuffles = np.array([np.random.permutation(a.index)
                         for _ in range(num_perm)])
    null_stats = {}
    for k, shuffled in enumerate(shuffles):
        pos = range(len(shuffled))
        a_p = pd.Series(list(a.ix[shuffled]), pos)
        b_p = pd.Series(list(b.ix[shuffled]), pos)
        # Survival keeps its original order, so the event labels are
        # shuffled relative to the outcomes.
        surv_values = surv.unstack().ix[a.index].as_matrix()
        surv_p = pd.DataFrame(surv_values, index=pos,
                              columns=['days', 'event']).stack()
        null_stats[k] = get_interaction(a_p, b_p, surv_p, int_direction)
    # Permutations whose statistic came back missing are discarded.
    null_stats = pd.Series(null_stats).dropna()

    # Fraction of permuted statistics strictly above the observed one.
    n_total = len(null_stats)
    n_not_above = sum(null_stats <= r)
    empirical_p = 1. * (n_total - n_not_above) / n_total
    return pd.Series({'p': empirical_p, 'interaction': int_direction})
def pearson_p(a, b):
    '''
    Return the p-value of Pearson's correlation between two Series.
    ------------------------------------------------
    a, b: Series with continuous measurements

    Missing values are dropped from each Series before the two are
    aligned with match_series.
    '''
    x, y = match_series(a.dropna(), b.dropna())
    # pearsonr yields (correlation, p-value); only the p-value is wanted.
    return pearsonr(x, y)[1]
def violin_plot_pandas(bin_vec, real_vec, ann='p', order=None, ax=None,
                       filename=None):
    """
    http://pyinsci.blogspot.com/2009/09/violin-plot-with-matplotlib.html
    Wrapper around matplotlib's boxplot function to add violin profile.

    Inputs
        bin_vec:  Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ann:      annotation drawn in the lower right of the axes:
                    'p'        Kruskal-Wallis p-value in plain text
                    'p_fancy'  the same p-value formatted by latex_float
                    None       no annotation
                    other      the value itself is used as the text
        order:    explicit category order; defaults to the order returned
                  by bin_vec.value_counts()
        ax:       axes handed to init_ax (None by default)
        filename: if given, the figure is saved to this path

    If the violin plot cannot be drawn, a plain box plot is drawn instead.
    """
    fig, ax = init_ax(ax)
    ax.set_ylabel(real_vec.name)
    ax.set_xlabel(bin_vec.name)
    bin_vec, real_vec = match_series(bin_vec, real_vec)

    try:
        categories = bin_vec.value_counts().index if order is None else order
        _violin_plot(ax, [real_vec[bin_vec == num] for num in categories],
                     pos=categories, bp=True)
        ax.set_xticklabels([str(c) + '\n(n=%i)' % sum(bin_vec == c)
                            for c in categories])
    except Exception:
        # Best-effort fallback; Exception (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        box_plot_pandas(bin_vec, real_vec, ax=ax)

    p_value = Stats.kruskal_pandas(bin_vec, real_vec)['p']
    # A single if/elif chain: previously 'p_fancy' also fell through to the
    # generic branch and the literal text 'p_fancy' was drawn on the plot.
    if ann == 'p_fancy':
        ax.annotate('$p = {}$'.format(latex_float(p_value)), (.95, -.02),
                    xycoords='axes fraction', ha='right', va='bottom',
                    size=14)
    elif ann == 'p':
        ax.annotate('p = {0:.1e}'.format(p_value), (.95, .02),
                    xycoords='axes fraction', ha='right', va='bottom',
                    size=12)
    elif ann is not None:
        ax.annotate(ann, (.95, .02), xycoords='axes fraction', ha='right',
                    va='bottom', size=12)

    if filename is not None:
        fig.savefig(filename)
    return
def kruskal_p(hit_vec, response_vec, min_size=5):
    '''
    Wrapper to run the kruskal test on pandas Series and return only
    its p-value
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: not used by this function; kept so the signature matches
              the sibling wrappers

    Returns nan if the Series cannot be aligned or the test fails.
    '''
    try:
        hit_vec, response_vec = match_series(hit_vec, response_vec)
        # One sample of measurements per distinct label.
        groups = [response_vec[hit_vec == num] for num in hit_vec.unique()]
        return kruskal(*groups)[1]
    except Exception:
        # Deliberate best-effort, but let KeyboardInterrupt / SystemExit
        # through instead of swallowing them with a bare except.
        return nan
def bartlett_pandas(group_vec, response_vec, min_size=5):
    '''
    Wrapper to run the bartlett test on pandas Series
    ------------------------------------------------
    group_vec: Series of labels
    response_vec: Series of measurements
    min_size: smallest allowed label count; if any label in group_vec
              occurs fewer times than this, nan is returned

    Returns a Series with entries 'T' and 'p'.
    '''
    # The size check uses group_vec as passed in, before alignment.
    smallest_group = group_vec.value_counts().min()
    if smallest_group < min_size:
        return nan

    group_vec, response_vec = match_series(group_vec, response_vec)
    samples = [response_vec[group_vec == label]
               for label in group_vec.unique()]
    res = bartlett(*samples)
    return pd.Series(res, index=['T','p'])
def pearson_pandas(a, b, min_size=5):
    '''
    Wrapper to compute Pearson's correlation on pandas Series
    ------------------------------------------------
    a, b: Series of measurements
    min_size: not used by this function; kept so the signature matches
              the sibling wrappers

    Returns a Series with entries 'rho' and 'p'.  If the Series cannot
    be aligned or the test fails, both entries are NaN.
    '''
    try:
        a, b = match_series(a, b)
        res = stats.pearsonr(a, b)
        return pd.Series(res, index=['rho','p'])
    except Exception:
        # Deliberate best-effort, but let KeyboardInterrupt / SystemExit
        # through.  dtype is explicit so the fallback is numeric like the
        # success path regardless of the pandas default for empty data.
        return pd.Series(index=['rho','p'], dtype=float)
def anova(hit_vec, response_vec, min_size=5):
    '''
    Wrapper to do a one way anova on pandas Series
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: smallest allowed label count; if any label in hit_vec
              occurs fewer times than this, nan is returned

    Returns a Series with entries 'F' and 'p'.
    '''
    # .min() must be called: comparing the bound method itself to
    # min_size never triggers the guard (and is a TypeError on Python 3).
    if hit_vec.value_counts().min() < min_size:
        return nan
    hit_vec, response_vec = match_series(hit_vec, response_vec)
    # One sample of measurements per distinct label.
    groups = [response_vec[hit_vec == num] for num in hit_vec.unique()]
    res = f_oneway(*groups)
    return pd.Series(res, index=['F','p'])
def kruskal_pandas(hit_vec, response_vec, min_size=5):
    '''
    Wrapper to run the kruskal test on pandas Series
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: not used by this function; kept so the signature matches
              the sibling wrappers

    Returns a Series with entries 'H' and 'p'.  If the Series cannot be
    aligned or the test fails, both entries are NaN.
    '''
    try:
        hit_vec, response_vec = match_series(hit_vec, response_vec)
        # One sample of measurements per distinct label.
        groups = [response_vec[hit_vec == num] for num in hit_vec.unique()]
        res = kruskal(*groups)
        return pd.Series(res, index=['H','p'])
    except Exception:
        # Deliberate best-effort, but let KeyboardInterrupt / SystemExit
        # through.  dtype is explicit so the fallback is numeric like the
        # success path regardless of the pandas default for empty data.
        return pd.Series(index=['H','p'], dtype=float)
def single_gene_section(cancer, hit_matrix, cutoff=.25):
    """
    Build the 'Gene Mutations' report sub-section.

    Inputs
        cancer:     object providing report_folder, patients, q_genes and
                    clinical.age
        hit_matrix: gene x patient table; rows sharing an index label are
                    collapsed to the first one
        cutoff:     a table cell gets a figure attached when its value is
                    below this

    The full gene table is written to 'gene_table.csv' in the report
    folder.  The table shown in the report keeps only genes with at least
    one value below .2 in cancer.q_genes, limited to the first 20 rows.
    Returns the section object produced through nz.
    """
    # Format data for report
    path = cancer.report_folder + '/'
    gene_table_file = path + 'gene_table.csv'

    hit_matrix = hit_matrix.groupby(level=0).first()  # Make index unique
    n_patients = (hit_matrix.ix[:, cancer.patients] > 0).sum(1)
    n_patients.name = 'n_patients'
    gene_names = Series({g: g for g in cancer.q_genes.index}, name='gene')

    gene_table = cancer.q_genes.join(n_patients).join(gene_names)
    gene_table = gene_table.ix[:, ::-1]  # reverse the column order
    if 'survival' in gene_table:
        gene_table = gene_table.sort(columns='survival')
    gene_table.to_csv(gene_table_file)

    # Reduce to the rows displayed in the report.
    any_below = (cancer.q_genes < .2).sum(1) > 0
    genes_to_show = cancer.q_genes[any_below].index
    gene_table = gene_table.ix[genes_to_show]
    if 'survival' in gene_table:
        gene_table = gene_table.sort(columns='survival')
    gene_table = gene_table.head(20)
    gene_table_r = com.convert_to_r_dataframe(gene_table)  #@UndefinedVariable

    if len(gene_table) == 0:
        return nz.addTo(nz.newSubSection('Gene Mutations'),
                        nz.newParagraph(''))

    # Overview
    tableCaption1 = ("Association of gene mutations with patient clinical "
                     "features.")
    table1 = nz.newTable(gene_table_r, tableCaption1, file=gene_table_file,
                         significantDigits=2)

    # Fill in the details (table cells are addressed 1-based)
    gene_pos = {g: i for i, g in enumerate(gene_table.index, 1)}
    col_pos = {c: i for i, c in enumerate(gene_table.columns, 1)}

    # age violin plots
    if 'age' in gene_table:
        for g, val in gene_table['age'].iteritems():
            hits = match_series(hit_matrix.ix[g], cancer.clinical.age)[0]
            n_hit = (hits > 0).sum()
            if val < cutoff and n_hit > 2:
                cell = (gene_pos[g], col_pos['age'])
                table1 = add_violin_plot(hit_matrix.ix[g], cancer, table1,
                                         cell, path + FIG_EXT)

    # survival curves
    if 'survival' in gene_table:
        for g, val in gene_table['survival'].iteritems():
            if val < cutoff:
                cell = (gene_pos[g], col_pos['survival'])
                table1 = add_survival_curve(hit_matrix.ix[g], cancer, table1,
                                            cell, path + FIG_EXT)

    return nz.addTo(nz.newSubSection('Gene Mutations'), table1)
def series_scatter(s1, s2, ax=None, ann='p', filename=None, **plot_args):
    """
    Scatter plot of two Series against each other.

    Inputs
        s1, s2:    Series plotted on the x and y axis (aligned with
                   match_series); their names become the axis labels
        ax:        axes handed to init_ax (None by default)
        ann:       'p' annotates the Tests.spearman_pandas p-value in plain
                   text, 'fancy_p' formats it with latex_float; any other
                   value draws no annotation
        filename:  if given, the figure is saved to this path
        plot_args: forwarded to ax.scatter; 's' defaults to 75 and 'alpha'
                   to .5 when not supplied

    NOTE(review): violin_plot_pandas spells the fancy option 'p_fancy'.
    """
    fig, ax = init_ax(ax, figsize=(6,4))
    plot_args.setdefault('s', 75)
    plot_args.setdefault('alpha', .5)

    x, y = match_series(s1, s2)
    ax.scatter(x, y, **plot_args)
    ax.set_xlabel(s1.name)
    ax.set_ylabel(s2.name)

    # Placement shared by both annotation flavours.
    where = (.95, -.02)
    text_args = dict(xycoords='axes fraction', ha='right', va='bottom',
                     size=14)
    if ann == 'p':
        p_val = Tests.spearman_pandas(s1, s2)['p']
        ax.annotate('p = {0:.1e}'.format(p_val), where, **text_args)
    elif ann == 'fancy_p':
        p_val = Tests.spearman_pandas(s1, s2)['p']
        ax.annotate('$p = {}$'.format(latex_float(p_val)), where,
                    **text_args)

    if filename is not None:
        fig.savefig(filename)
def series_scatter(s1, s2, ax=None, ann='p', filename=None, **plot_args):
    """
    Scatter plot of two Series against each other.

    Inputs
        s1, s2:    Series plotted on the x and y axis (aligned with
                   match_series); their names become the axis labels
        ax:        axes handed to init_ax (None by default)
        ann:       'p' annotates the Tests.spearman_pandas p-value in plain
                   text, 'fancy_p' formats it with latex_float; any other
                   value draws no annotation
        filename:  if given, the figure is saved to this path
        plot_args: forwarded to ax.scatter; 's' defaults to 75 and 'alpha'
                   to .5 when not supplied

    NOTE(review): violin_plot_pandas spells the fancy option 'p_fancy'.
    """
    fig, ax = init_ax(ax, figsize=(6, 4))
    plot_args.setdefault('s', 75)
    plot_args.setdefault('alpha', .5)

    x, y = match_series(s1, s2)
    ax.scatter(x, y, **plot_args)
    ax.set_xlabel(s1.name)
    ax.set_ylabel(s2.name)

    # Placement shared by both annotation flavours.
    where = (.95, -.02)
    text_args = dict(xycoords='axes fraction', ha='right', va='bottom',
                     size=14)
    if ann == 'p':
        p_val = Tests.spearman_pandas(s1, s2)['p']
        ax.annotate('p = {0:.1e}'.format(p_val), where, **text_args)
    elif ann == 'fancy_p':
        p_val = Tests.spearman_pandas(s1, s2)['p']
        ax.annotate('$p = {}$'.format(latex_float(p_val)), where,
                    **text_args)

    if filename is not None:
        fig.savefig(filename)
def pathway_mutation_section(cancer, gene_sets, cutoff=.25):
    """
    Build the 'Pathway Mutations' report sub-section.

    Inputs
        cancer:    object providing report_folder, q_pathways, meta_matrix
                   and clinical.age
        gene_sets: passed through to format_pathway_table
        cutoff:    a table cell gets a figure attached when its value is
                   below this

    The full pathway table is written to 'pathway_table.csv' in the report
    folder.  The table shown in the report keeps only pathways with at
    least one value below .25 in cancer.q_pathways, limited to the first
    20 rows.  Returns the section object produced through nz.
    """
    #Format data for report
    path = cancer.report_folder + '/'
    pathway_table_file = path + 'pathway_table.csv'
    pathway_table = format_pathway_table(cancer, gene_sets)
    if 'survival' in pathway_table:
        pathway_table = pathway_table.sort(columns='survival')
    pathway_table.to_csv(pathway_table_file)

    keepers = cancer.q_pathways[(cancer.q_pathways < .25).sum(1) > 0].index
    pathway_table = pathway_table.ix[keepers]
    if 'survival' in pathway_table:
        pathway_table = pathway_table.sort(columns='survival')
    pathway_table = pathway_table.head(20)
    # NOTE(review): missing values are replaced by the placeholder 1.23
    # before the conversion to an R data frame -- confirm the intent.
    pathway_table_r = com.convert_to_r_dataframe(pathway_table.replace(nan, 1.23)) #@UndefinedVariable
    if len(pathway_table) == 0:
        return nz.addTo(nz.newSubSection('Pathway Mutations'),
                        nz.newParagraph(''))

    #Overview
    # The two halves were previously joined without a separating space,
    # which rendered as "patientclinical features."
    tableCaption1 = ('Association of pathway level mutations with patient ' +
                     'clinical features.')
    table1 = nz.newTable(pathway_table_r, tableCaption1,
                         file=pathway_table_file, significantDigits=2)

    #Fill in the details (table cells are addressed 1-based)
    pathway_pos = dict((p,i+1) for i,p in enumerate(pathway_table.index))
    col_pos = dict((c,i+1) for i,c in enumerate(pathway_table.columns))

    #age violin plots
    if 'age' in pathway_table:
        for g,val in pathway_table['age'].iteritems():
            num_patients = (match_series(cancer.meta_matrix.ix[g],
                                         cancer.clinical.age)[0] > 0).sum()
            if val < cutoff and num_patients > 2:
                table1 = add_violin_plot(cancer.meta_matrix.ix[g], cancer,
                                         table1,
                                         (pathway_pos[g], col_pos['age']),
                                         path + FIG_EXT)

    #survival curves
    if 'survival' in pathway_table:
        for g,val in pathway_table['survival'].iteritems():
            if val < cutoff:
                table1 = add_survival_curve_pathway(
                    cancer.meta_matrix.ix[g], cancer, table1,
                    (pathway_pos[g], col_pos['survival']), path + FIG_EXT)

    section = nz.addTo(nz.newSubSection('Pathway Mutations'), table1)
    return section
def violin_plot_pandas(bin_vec, real_vec, ann='p', order=None, ax=None,
                       filename=None):
    """
    http://pyinsci.blogspot.com/2009/09/violin-plot-with-matplotlib.html
    Wrapper around matplotlib's boxplot function to add violin profile.

    Inputs
        bin_vec:  Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ann:      annotation drawn in the lower right of the axes:
                    'p'        Kruskal-Wallis p-value in plain text
                    'p_fancy'  the same p-value formatted by latex_float
                    None       no annotation
                    other      the value itself is used as the text
        order:    explicit category order; defaults to the order returned
                  by bin_vec.value_counts()
        ax:       axes handed to init_ax (None by default)
        filename: if given, the figure is saved to this path

    If the violin plot cannot be drawn, a plain box plot is drawn instead.
    """
    fig, ax = init_ax(ax)
    ax.set_ylabel(real_vec.name)
    ax.set_xlabel(bin_vec.name)
    bin_vec, real_vec = match_series(bin_vec, real_vec)

    try:
        categories = bin_vec.value_counts().index if order is None else order
        _violin_plot(ax, [real_vec[bin_vec == num] for num in categories],
                     pos=categories, bp=True)
        ax.set_xticklabels([str(c) + '\n(n=%i)' % sum(bin_vec == c)
                            for c in categories])
    except Exception:
        # Best-effort fallback; Exception (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        box_plot_pandas(bin_vec, real_vec, ax=ax)

    p_value = Stats.kruskal_pandas(bin_vec, real_vec)['p']
    # A single if/elif chain: previously 'p_fancy' also fell through to the
    # generic branch and the literal text 'p_fancy' was drawn on the plot.
    if ann == 'p_fancy':
        ax.annotate('$p = {}$'.format(latex_float(p_value)), (.95, -.02),
                    xycoords='axes fraction', ha='right', va='bottom',
                    size=14)
    elif ann == 'p':
        ax.annotate('p = {0:.1e}'.format(p_value), (.95, .02),
                    xycoords='axes fraction', ha='right', va='bottom',
                    size=12)
    elif ann is not None:
        ax.annotate(ann, (.95, .02), xycoords='axes fraction', ha='right',
                    va='bottom', size=12)

    if filename is not None:
        fig.savefig(filename)
    return
def test(hit_vec):
    '''
    One way anova of self.response_vec grouped by the labels in hit_vec.
    `self` is not a parameter here; it is read from the enclosing scope.
    Returns a Series with entries 'stat' and 'p'.
    '''
    hit_vec, response_vec = match_series(hit_vec, self.response_vec)
    # One sample of measurements per distinct label.
    groups = [response_vec[hit_vec == label] for label in hit_vec.unique()]
    f_stat, p_val = f_oneway(*groups)
    return Series({'stat': f_stat, 'p': p_val})