# Label the current axes, pin the y-range to [-0.5, 0.5], then tighten the
# figure layout.
plt.gca().set(
    xlabel='TCGA cancer type',
    ylabel='cindex(signal) - cindex(shuffled)',
    title='Survival prediction, {}'.format(me_results_desc),
    ylim=(-0.5, 0.5),
)

plt.tight_layout()

# ### Heatmap
#
# This is similar to the heatmaps we plotted in the results script in `02_classify_mutations` for the mutation prediction problem. We want to compare data types for predicting survival in different cancer types.

# In[10]:

# Compare data types on the c-index, leaving out the rows whose cancer type
# is listed in `drop_cancer_types`. The comparison output names its identifier
# column 'gene', so it is renamed to 'cancer_type' here.
me_all_results_df = au.compare_all_data_types(
    me_results_df.loc[~me_results_df['cancer_type'].isin(drop_cancer_types)],
    SIG_ALPHA,
    identifier='cancer_type',
    metric='cindex',
).rename(columns={'gene': 'cancer_type'})

# Preview the ten rows with the smallest p-values.
me_all_results_df.sort_values(by='p_value').head(10)

# In[11]:

# Wide table for the heatmap: one row per training data type, one column per
# cancer type, cells holding `delta_mean`. Rows are put in sorted order of the
# training data types found in `me_compare_df`.
me_heatmap_df = (
    me_all_results_df
    .pivot(index='training_data', columns='cancer_type', values='delta_mean')
    .reindex(sorted(me_compare_df['training_data'].unique()))
)

# Preview the first five columns.
me_heatmap_df.iloc[:, :5]

# In[12]:
# Example #2
# 0
# Fixed ordering of the experiment groups, used for the plot hues.
hue_order = ['Control', 'Drop target', 'Only target']

# AUPR per identifier, one box per experiment.
sns.boxplot(
    data=plot_df, x='identifier', y='aupr',
    hue='experiment', hue_order=hue_order,
)

# ### Correct for shuffled baseline + plot signal AUPR - shuffled AUPR

# In[5]:

# The comparison helper is fed a 'training_data' column, so the 'experiment'
# labels are moved into that column for the call and moved back afterwards.
results_df = results_df.drop(columns='training_data')
results_df = results_df.rename(columns={'experiment': 'training_data'})

all_results_df = au.compare_all_data_types(
    results_df, 0.05, filter_genes=False, compare_ind=True,
).rename(columns={'training_data': 'experiment'})

all_results_df.head()

# In[6]:

# Seaborn-wide settings: figure size, grid style and colour palette. The
# order matters: the later calls refine what `sns.set` establishes.
sns.set({'figure.figsize': (13, 6)})
sns.set_style('whitegrid')
sns.set_palette('Set2')

# `delta_aupr` per gene, one box per experiment, in the order given by
# `hue_order`.
sns.boxplot(
    data=all_results_df, x='gene', y='delta_aupr',
    hue='experiment', hue_order=hue_order,
)
# Example #3
# 0
        if _y > 0:
            ax.text(_x, _y + 0.01, val, ha="center")
        else:
            ax.text(_x, _y - 0.02, val, ha="center")


# Annotate the bars of the current axes, hide its x-axis, and label the plot.
show_values_on_bars(plt.gca())
plt.gca().xaxis.set_visible(False)
plt.ylabel('{} difference'.format(metric.upper()))
plt.title(
    'Performance difference between new and old shuffling scheme',
    size=14,
)

# In[8]:

# AUPR comparison for the old results (second argument 0.05).
old_all_results_df = au.compare_all_data_types(
    old_results_df, 0.05, metric='aupr',
)
old_all_results_df.head()

# In[9]:

# Same comparison, same settings, for the new results.
new_all_results_df = au.compare_all_data_types(
    new_results_df, 0.05, metric='aupr',
)
new_all_results_df.head()

# In[10]:

# Copy of the new comparison results with an extra `mean_diff` column holding
# new `delta_mean` minus old `delta_mean`.
# NOTE(review): the subtraction aligns rows by index label; this assumes both
# result frames are indexed identically -- confirm.
compare_results_df = new_all_results_df.assign(
    mean_diff=(new_all_results_df['delta_mean']
               - old_all_results_df['delta_mean']),
)
# Example #4
# 0
# AUROC panel (second axes): one box per identifier and training data type.
sns.boxplot(
    data=plot_df,
    x='identifier',
    y='auroc',
    hue='training_data',
    ax=axarr[1],
)
# Override the labels seaborn picked and fix the y-range.
axarr[1].set(
    title='MSI prediction performance, AUROC',
    xlabel='Cancer type',
    ylabel='AUROC',
    ylim=(0.35, 1.05),
)
axarr[1].legend(title='Data type')


# ### Plot results per cancer type, corrected for permuted labels baseline

# In[5]:


# Run the same comparison once per metric. The output names its identifier
# column 'gene', so it is renamed to 'cancer_type' in both tables.
compare_df = au.compare_all_data_types(
    results_df,
    sig_alpha=0.05,
    filter_genes=False,
    compare_ind=True,
    metric='auroc',
).rename(columns={'gene': 'cancer_type'})

compare_aupr_df = au.compare_all_data_types(
    results_df,
    sig_alpha=0.05,
    filter_genes=False,
    compare_ind=True,
    metric='aupr',
).rename(columns={'gene': 'cancer_type'})

# Quick look at the AUROC table: dimensions, data types present, first rows.
print(compare_df.shape)
print(compare_df['training_data'].unique())
compare_df.head()
# Example #5
# 0
print(results_df.shape)

# Display names for the raw `training_data` codes; values that are not keys
# of this mapping are left unchanged by `replace`.
training_data_map = {
    'expression': 'gene expression',
    'me_27k': '27k methylation',
    'me_450k': '450k methylation',
    'rppa': 'RPPA',
    'mirna': 'microRNA',
    'mut_sigs': 'mutational signatures',
}

# Assign the relabelled column back explicitly. Calling
# `replace(..., inplace=True)` on `results_df.training_data` mutates a Series
# obtained through attribute access; with pandas Copy-on-Write that change
# never reaches `results_df`, and pandas 2.x already warns about the pattern.
results_df['training_data'] = results_df['training_data'].replace(
    training_data_map)
results_df.head()

# In[6]:

# Compare all data types on the configured plotting metric.
all_results_df = au.compare_all_data_types(
    results_df, SIG_ALPHA, metric=plot_metric,
)

# Write the full comparison table as a tab-separated file without the index,
# creating the output directory first if it does not exist yet.
cfg.sig_genes_dir.mkdir(exist_ok=True)
all_results_df.to_csv(cfg.sig_genes_all, sep='\t', index=False)

# Preview the ten rows with the smallest p-values.
all_results_df.sort_values(by='p_value').head(10)

# In[7]:

# Wide figure with a white grid, split into one row of three panels.
sns.set({'figure.figsize': (22, 5)})
sns.set_style('whitegrid')

fig, axarr = plt.subplots(nrows=1, ncols=3)

# plot mutation prediction from expression, in a volcano-like plot
# Example #6
# 0
# and that we have data for two replicates (two random seeds)
# One value per output line: dimensions, distinct seeds, distinct training
# data types.
print(
    random_50_df.shape,
    random_50_df['seed'].unique(),
    random_50_df['training_data'].unique(),
    sep='\n',
)
random_50_df.head()

# In[6]:

# Stack the three gene-set result tables, then let 'gene_set' take the place
# of the 'training_data' column (the original 'training_data' and
# 'experiment' columns are dropped).
results_df = (
    pd.concat([vogelstein_df, top_50_df, random_50_df])
    .drop(columns=['training_data', 'experiment'])
    .rename(columns={'gene_set': 'training_data'})
)

all_results_df = au.compare_all_data_types(
    results_df, SIG_ALPHA, filter_genes=False, metric=plot_metric,
)

# Negative log10 of `corr_pval`, stored alongside the comparison results.
all_results_df['nlog10_p'] = -np.log10(all_results_df['corr_pval'])

# Preview the ten rows with the smallest p-values.
all_results_df.sort_values(by='p_value').head(10)

# In[7]:

# Wide figure with a white grid, split into one row of three panels.
sns.set({'figure.figsize': (24, 6)})
sns.set_style('whitegrid')

fig, axarr = plt.subplots(nrows=1, ncols=3)

gene_set_map = {
    'random_50': 'random',
    'top_50': 'most mutated',
    'vogelstein': 'Vogelstein et al.'
# Example #7
# 0
        adjust_text(text_labels,
                    ax=ax, 
                    expand_text=(1., 1.),
                    lim=5)

# Figure-level title, padded tight layout, then cap the subplot area at
# top=0.94.
plt.gcf().suptitle(
    'Mutation prediction, raw vs. compressed results',
    size=16,
)
plt.tight_layout(h_pad=2, w_pad=2)
plt.gcf().subplots_adjust(top=0.94)


# In[6]:


# Comparison of the raw results with `compare_ind=True`, ordered by training
# data type.
raw_compare_df = au.compare_all_data_types(
    raw_results_df,
    SIG_ALPHA,
    filter_genes=False,
    compare_ind=True,
    metric=plot_metric,
).sort_values(by=['training_data'])

raw_compare_df.head(5)


# In[7]:


# Same comparison of the raw results, this time without `compare_ind`.
raw_compare_all_df = au.compare_all_data_types(
    raw_results_df,
    SIG_ALPHA,
    filter_genes=False,
    metric=plot_metric,
)