Code example #1
0
def _ttest_for_groups(X, y, control, save_name, stats_dir, args):
    """ Perform t-tests comparing each group in `y` against `control`, save both
        uncorrected and FDR-corrected results to CSV, and return the features
        that are significant (after correction) for any comparison.

        Inputs
        ------
        X : pd.DataFrame
            Feature summaries (rows aligned with `y`)
        y : pd.Series
            Group label for each row of X
        control : str
            Control group label in `y`
        save_name : str
            File name stem for results saved under `stats_dir`
        stats_dir : pathlib.Path
            Directory to save results to
        args : Object
            Must provide 'fdr_method' and 'pval_threshold' attributes

        Returns
        -------
        list of significant feature names (not ordered)
    """
    ttest_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(save_name)
    ttest_path = stats_dir / '{}_results.csv'.format(save_name)
    ttest_path.parent.mkdir(exist_ok=True, parents=True)

    # perform t-tests (without correction for multiple testing)
    stats_t, pvals_t, reject_t = univariate_tests(X=X,
                                                  y=y,
                                                  control=control,
                                                  test=t_test,
                                                  comparison_type='binary_each_group',
                                                  multitest_correction=None,
                                                  alpha=0.05)

    # get effect sizes for comparisons
    effect_sizes_t = get_effect_sizes(X=X,
                                      y=y,
                                      control=control,
                                      effect_type=None,
                                      linked_test=t_test)

    # compile + save t-test results (uncorrected)
    stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
    pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
    reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
    effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
    ttest_uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
    ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)

    # correct for multiple comparisons (strip 'pvals_' prefix first so columns
    # are plain group names when passed to the correction routine)
    pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
    reject_t, pvals_t = _multitest_correct(pvals_t,
                                           multitest_method=args.fdr_method,
                                           fdr=args.pval_threshold)

    # compile + save t-test results (corrected)
    pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
    reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
    ttest_corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
    ttest_corrected.to_csv(ttest_path, header=True, index=True)

    # record t-test significant features (not ordered)
    # NB: fixed from `pvals_t[np.asmatrix(reject_t)].index.unique()` — 2-D
    # boolean masking of a DataFrame returns a NaN-masked frame with the FULL
    # index in modern pandas (i.e. every feature). Selecting rows where any
    # comparison rejects matches the intended behaviour documented by the
    # original (commented-out) assertion on uncorrected p-values.
    fset = pvals_t.index[np.asarray(reject_t).any(axis=1)].unique().to_list()

    if len(fset) > 0:
        write_list_to_file(fset, stats_dir / '{}_sigfeats.txt'.format(save_name))

    return fset


def dead_keio_stats(features, metadata, args):
    """ Perform statistical analyses on dead Keio experiment results:
        - t-tests for each feature comparing each strain vs control for paired antioxidant treatment conditions
        - t-tests for each feature comparing each strain antioxidant treatment to negative control (no antioxidant)
        
        Inputs
        ------
        features, metadata : pd.DataFrame
            Clean feature summaries and accompanying metadata
        
        args : Object
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - control_dict : dict
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int           
    """

    print("\nInvestigating variation in worm behaviour on dead vs alive hit Keio strains")  

    # assert there will be no errors due to case-sensitivity
    assert len(metadata[STRAIN_COLNAME].unique()) == len(metadata[STRAIN_COLNAME].str.upper().unique())
    # treatment column must be boolean (dead/alive flag)
    assert all(isinstance(b, np.bool_) for b in metadata[TREATMENT_COLNAME].unique())

    # Load Tierpsy feature set + subset (columns) for selected features only
    features = select_feat_set(features, 'tierpsy_{}'.format(args.n_top_feats), append_bluelight=True)
    # drop path curvature features
    features = features[[f for f in features.columns if 'path_curvature' not in f]]

    assert not features.isna().any().any()

    strain_list = list(metadata[STRAIN_COLNAME].unique())
    assert CONTROL_STRAIN in strain_list

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=[STRAIN_COLNAME, TREATMENT_COLNAME])
    print("Mean sample size of %s: %d" % (STRAIN_COLNAME, int(sample_size['n_samples'].mean())))

    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / "Stats" / args.fdr_method

    ##### ANOVA #####

    # make path to save ANOVA results
    test_path = stats_dir / 'ANOVA_results.csv'
    test_path.parent.mkdir(exist_ok=True, parents=True)

    # ANOVA across strains for significant feature differences
    # (only meaningful when there are more than two strains)
    if len(metadata[STRAIN_COLNAME].unique()) > 2:
        stats, pvals, reject = univariate_tests(X=features, 
                                                y=metadata[STRAIN_COLNAME], 
                                                test='ANOVA',
                                                control=CONTROL_STRAIN,
                                                comparison_type='multiclass',
                                                multitest_correction=None, # uncorrected
                                                alpha=args.pval_threshold,
                                                n_permutation_test=None) # 'all'

        # get effect sizes
        effect_sizes = get_effect_sizes(X=features, 
                                        y=metadata[STRAIN_COLNAME], 
                                        control=CONTROL_STRAIN,
                                        effect_type=None,
                                        linked_test='ANOVA')

        # correct for multiple comparisons
        reject_corrected, pvals_corrected = _multitest_correct(pvals, 
                                                               multitest_method=args.fdr_method,
                                                               fdr=args.pval_threshold)

        # compile + save results (corrected)
        test_results = pd.concat([stats, effect_sizes, pvals_corrected, reject_corrected], axis=1)
        test_results.columns = ['stats','effect_size','pvals','reject']     
        test_results['significance'] = sig_asterix(test_results['pvals'])
        test_results = test_results.sort_values(by=['pvals'], ascending=True) # rank pvals
        test_results.to_csv(test_path, header=True, index=True)

        nsig = test_results['reject'].sum()
        # NB: fixed from `len(test_results.index)/nsig` which inverted the
        # percentage and raised ZeroDivisionError when nsig == 0
        print("%d features (%.f%%) significantly different among '%s'" % (nsig, 
              100 * nsig / len(test_results.index), STRAIN_COLNAME))

    ##### t-tests #####

    # t-tests for each feature comparing live vs dead behaviour, per strain
    for strain in strain_list:
        strain_meta = metadata[metadata[STRAIN_COLNAME]==strain]
        strain_feat = features.reindex(strain_meta.index)

        fset_ttest = _ttest_for_groups(X=strain_feat,
                                       y=strain_meta[TREATMENT_COLNAME],
                                       control=CONTROL_TREATMENT,
                                       save_name=t_test + '_' + strain,
                                       stats_dir=stats_dir,
                                       args=args)
        print("%d significant features for %s on any %s vs %s (%s, %s, P<%.2f)" % (len(fset_ttest),
              strain, TREATMENT_COLNAME, CONTROL_TREATMENT, t_test, args.fdr_method, args.pval_threshold))

    ##### for LIVE bacteria: compare each strain with control #####

    live_metadata = metadata[metadata['dead']==False]
    live_features = features.reindex(live_metadata.index)

    fset_ttest = _ttest_for_groups(X=live_features,
                                   y=live_metadata[STRAIN_COLNAME],
                                   control=CONTROL_STRAIN,
                                   save_name='{}_live'.format(t_test),
                                   stats_dir=stats_dir,
                                   args=args)
    print("LIVE BACTERIA: %d significant features for any %s vs %s (%s, %s, P<%.2f)" %\
          (len(fset_ttest), STRAIN_COLNAME, CONTROL_STRAIN, t_test, args.fdr_method, 
           args.pval_threshold))

    ##### for DEAD bacteria: compare each strain with control #####

    dead_metadata = metadata[metadata['dead']==True]
    dead_features = features.reindex(dead_metadata.index)

    fset_ttest = _ttest_for_groups(X=dead_features,
                                   y=dead_metadata[STRAIN_COLNAME],
                                   control=CONTROL_STRAIN,
                                   save_name='{}_dead'.format(t_test),
                                   stats_dir=stats_dir,
                                   args=args)
    print("DEAD BACTERIA: %d significant features for any %s vs %s (%s, %s, P<%.2f)" %\
          (len(fset_ttest), STRAIN_COLNAME, CONTROL_STRAIN, t_test, args.fdr_method, 
           args.pval_threshold))
Code example #2
0
    # align bluelight conditions (as separate feature columns)
    features_df, metadata_df = align_bluelight_conditions(
        features_df,
        metadata_df,
        merge_on_cols=['date_yyyymmdd', 'imaging_plate_id', 'well_name'])

    ### clean data

    # remove rows with missing strain information (n=10)
    metadata_df = metadata_df[~metadata_df[args.strain_colname].isna()]
    features_df = features_df.reindex(metadata_df.index)

    # subset for Tierpsy features only
    if args.n_features is not None:
        features_df = select_feat_set(features_df,
                                      tierpsy_set_name='tierpsy_{}'.format(
                                          args.n_features),
                                      append_bluelight=True)

    # subset for given imaging dates
    metadata_df = metadata_df[metadata_df['date_yyyymmdd'].astype(str).isin(
        args.imaging_dates)]
    # BUG FIX: reindex features on the filtered *metadata* index (the original
    # `features_df.reindex(features_df.index)` was a no-op self-reindex, so
    # feature rows for dropped imaging dates were silently retained)
    features_df = features_df.reindex(metadata_df.index)

    # use tierpsytools functions to clean features data and remove NaNs

    # Drop rows based on percentage of NaN values across features for each row
    # NB: axis=1 will sum the NaNs across all the columns for each row
    # (threshold=0.8 — presumably rows must be >=80% non-NaN to be kept;
    # confirm against tierpsytools filter_nan_inf documentation)
    features_df = filter_nan_inf(features_df,
                                 threshold=0.8,
                                 axis=1,
                                 verbose=True)
Code example #3
0
    # Load p-values from confirmational Keio screen
    # Keep only p-value columns and strip the 'pvals_' prefix, leaving plain
    # strain names as columns (to match `pvals` from the initial screen,
    # which is loaded earlier, outside this excerpt)
    pvals2 = pd.read_csv(KEIO_CONF_STATS_PATH, index_col=0)
    pvals2 = pvals2[[c for c in pvals2.columns if 'pval' in c]]
    pvals2.columns = [c.split('pvals_')[-1] for c in pvals2.columns]

    # assert index match (both screens tested the same feature set)
    assert set(pvals.index) == set(pvals2.index)

    # subset for shared columns only (strains present in both screens, ie. hit strains)
    shared = list(set(pvals.columns).intersection(set(pvals2.columns)))
    pvals, pvals2 = pvals[shared], pvals2[shared]

    # load Tierpsy top features (and expand for bluelight)
    # NB: select_feat_set filters columns, so transpose to features-as-columns
    # for selection and transpose back afterwards
    pvals = select_feat_set(pvals.T,
                            'tierpsy_{}'.format(N_TOP_FEATS),
                            append_bluelight=True).T

    feature_list = pvals.index.to_list()
    strain_list = pvals.columns.to_list()

    # fig, axs = strain_pval_pairplot(pvals, pvals2, strain_list=strain_list,
    #                                 saveAs=Path(SAVE_DIR) / 'pairplot.pdf')

    # initial vs confirm screen - plot p-value corr of significant features for each strain
    errlog = strain_pval_plot(pvals,
                              pvals2,
                              strain_list=strain_list,
                              saveDir=Path(SAVE_DIR) / 'p-value_strain_corr',
                              figsize=(8, 8))
    print(
Code example #4
0
# Subset metadata for the train/test compounds only, and align features
meta = meta[meta['drug_type'].isin(train_compounds['drug_type'].to_list()+
                                   test_compounds['drug_type'].to_list()
                                   )]
feat = feat.loc[meta.index]

# Impute nans
if align_blue:
    # bluelight conditions are aligned as columns: impute with global column means
    feat = feat.fillna(feat.mean())
else:
    # impute separately within each bluelight condition
    means_cv = {blue: x.mean() for blue,x in feat.groupby(by=meta['bluelight'])}
    feat = [x.fillna(means_cv[blue])
            for blue,x in feat.groupby(by=meta['bluelight'])]
    feat = pd.concat(feat).sort_index()

# Choose tierpsy256
feat = select_feat_set(feat, tierpsy_set_name='tierpsy_256', append_bluelight=align_blue)

#%% Get average doses
# Get the DMSO points
dmso_ids = meta['drug_type']=='DMSO'
# BUG FIX: take an explicit copy before mutating — the original mutated a
# slice of `meta` via .loc, which raises SettingWithCopyWarning and is not
# guaranteed to behave consistently under pandas copy-on-write
meta_dmso = meta[dmso_ids].copy()

# Split DMSO rows into 6 pseudo-groups, stratified by imaging date, and
# relabel them DMSO_0..DMSO_5 so replicates can be averaged like separate doses
splitter = StratifiedKFold(n_splits=6)
for i,(_, idx) in enumerate(splitter.split(meta_dmso, meta_dmso['date_yyyymmdd'])):
    dfidx = meta_dmso.index[idx]
    meta_dmso.loc[dfidx, 'drug_type'] = 'DMSO_{}'.format(i)

# write the relabelled DMSO rows back (label-based row assignment via .loc)
meta.loc[dmso_ids] = meta_dmso

# Get average doses for remaining compounds
feat_cols = feat.columns
Code example #5
0
def analyse_acute_rescue(features, 
                         metadata,
                         save_dir,
                         control_strain, 
                         control_antioxidant, 
                         control_window,
                         fdr_method='fdr_by',
                         pval_threshold=0.05,
                         remove_outliers=False):
 
    stats_dir =  Path(save_dir) / "Stats" / fdr_method
    plot_dir = Path(save_dir) / "Plots" / fdr_method

    strain_list = [control_strain] + [s for s in metadata['gene_name'].unique() if s != control_strain]  
    antiox_list = [control_antioxidant] + [a for a in metadata['antioxidant'].unique() if 
                                           a != control_antioxidant]
    window_list = [control_window] + [w for w in metadata['window'].unique() if w != control_window]

    # categorical variables to investigate: 'gene_name', 'antioxidant' and 'window'
    print("\nInvestigating difference in fraction of worms paused between hit strain and control " +
          "(for each window), in the presence/absence of antioxidants:\n")    

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=['gene_name', 'antioxidant', 'window'])
    print("Mean sample size of strain/antioxidant for each window: %d" %\
          (int(sample_size['n_samples'].mean())))
            
    # plot dates as different colours (in loop)
    date_lut = dict(zip(list(metadata['date_yyyymmdd'].unique()), 
                        sns.color_palette('Set1', n_colors=len(metadata['date_yyyymmdd'].unique()))))
        
    for strain in strain_list[1:]: # skip control_strain at first index postion        
        plot_meta = metadata[np.logical_or(metadata['gene_name']==strain, 
                                           metadata['gene_name']==control_strain)]
        plot_feat = features.reindex(plot_meta.index)
        plot_df = plot_meta.join(plot_feat[[FEATURE]])
        
        # Is there a difference between strain vs control at any window? (pooled antioxidant data)
        print("Plotting windows for %s vs control" % strain)
        plt.close('all')
        fig, ax = plt.subplots(figsize=((len(window_list) if len(window_list) >= 20 else 12),8))
        ax = sns.boxplot(x='window', y=FEATURE, hue='gene_name', hue_order=strain_list, order=window_list,
                         data=plot_df, palette='Set3', dodge=True, ax=ax)
        for date in date_lut.keys():
            date_df = plot_df[plot_df['date_yyyymmdd']==date]   
            ax = sns.stripplot(x='window', y=FEATURE, hue='gene_name', order=window_list,
                               hue_order=strain_list, data=date_df, 
                               palette={control_strain:date_lut[date], strain:date_lut[date]}, 
                               alpha=0.7, size=4, dodge=True, ax=ax)
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
                
        # scale plot to omit outliers (>2.5*IQR from mean)
        if scale_outliers_box:
            grouped_strain = plot_df.groupby('window')
            y_bar = grouped_strain[FEATURE].median() # median is less skewed by outliers
            # Computing IQR
            Q1 = grouped_strain[FEATURE].quantile(0.25)
            Q3 = grouped_strain[FEATURE].quantile(0.75)
            IQR = Q3 - Q1
            plt.ylim(-0.02, max(y_bar) + 3 * max(IQR))
            
        # load t-test results + annotate p-values on plot
        for ii, window in enumerate(window_list):
            ttest_strain_path = stats_dir / 'pairwise_ttests' / 'window' /\
                                '{}_window_results.csv'.format(strain)
            ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0, header=0)
            strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]] 
            strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns] 
            p = strain_pvals_t.loc[FEATURE, str(window)]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == str(window)
            p_text = 'P<0.001' if p < 0.001 else 'P=%.3f' % p
            #y = (y_bar[antiox] + 2 * IQR[antiox]) if scale_outliers_box else plot_df[feature].max()
            #h = (max(IQR) / 10) if scale_outliers_box else (y - plot_df[feature].min()) / 50
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            plt.plot([ii-.3, ii-.3, ii+.3, ii+.3], 
                     [0.98, 0.99, 0.99, 0.98], #[y+h, y+2*h, y+2*h, y+h], 
                     lw=1.5, c='k', transform=trans)
            ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom', transform=trans,
                    rotation=(0 if len(window_list) <= 20 else 90))
            
        ax.set_xticks(range(len(window_list)+1))
        xlabels = [str(int(WINDOW_FRAME_DICT[w][0]/60)) for w in window_list]
        ax.set_xticklabels(xlabels)
        x_text = 'Time (minutes)' if ALL_WINDOWS else 'Time of bluelight 10-second burst (minutes)'
        ax.set_xlabel(x_text, fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)
        
        fig_savepath = plot_dir / 'window_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)
    
    
        # Is there a difference between strain vs control for any antioxidant? (pooled window data)
        plt.close('all')
        fig, ax = plt.subplots(figsize=(10,8))
        ax = sns.boxplot(x='antioxidant', y=FEATURE, hue='gene_name', hue_order=strain_list, data=plot_df,
                          palette='Set3', dodge=True, order=antiox_list)
        ax = sns.swarmplot(x='antioxidant', y=FEATURE, hue='gene_name', hue_order=strain_list, data=plot_df,
                          color='k', alpha=0.7, size=4, dodge=True, order=antiox_list)
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
        ax.set_xlabel('antioxidant', fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)
        
        # scale plot to omit outliers (>2.5*IQR from mean)
        if scale_outliers_box:
            grouped_strain = plot_df.groupby('antioxidant')
            y_bar = grouped_strain[FEATURE].median() # median is less skewed by outliers
            # Computing IQR
            Q1 = grouped_strain[FEATURE].quantile(0.25)
            Q3 = grouped_strain[FEATURE].quantile(0.75)
            IQR = Q3 - Q1
            plt.ylim(min(y_bar) - 2.5 * max(IQR), max(y_bar) + 2.5 * max(IQR))
            
        # annotate p-values
        for ii, antiox in enumerate(antiox_list):
            ttest_strain_path = stats_dir / 'pairwise_ttests' / 'antioxidant' /\
                                '{}_antioxidant_results.csv'.format(strain)
            ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0, header=0)
            strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]] 
            strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns] 
            p = strain_pvals_t.loc[FEATURE, antiox]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == antiox
            p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
            #y = (y_bar[antiox] + 2 * IQR[antiox]) if scale_outliers_box else plot_df[feature].max()
            #h = (max(IQR) / 10) if scale_outliers_box else (y - plot_df[feature].min()) / 50
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            plt.plot([ii-.2, ii-.2, ii+.2, ii+.2], 
                      [0.8, 0.81, 0.81, 0.8], #[y+h, y+2*h, y+2*h, y+h], 
                      lw=1.5, c='k', transform=trans)
            ax.text(ii, 0.82, p_text, fontsize=9, ha='center', va='bottom', transform=trans)
                
        fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)
        
    # Plot for each strain separately to see whether antioxidants had an effect at all
    for strain in strain_list:
            
        plt.close('all')
        fig, ax = plt.subplots(figsize=(10,8))
        ax = sns.boxplot(x='antioxidant', y=FEATURE, order=antiox_list, 
                         dodge=True, data=plot_df[plot_df['gene_name']==strain])
        ax = sns.swarmplot(x='antioxidant', y=FEATURE, order=antiox_list, 
                           dodge=True, data=plot_df[plot_df['gene_name']==strain],
                           alpha=0.7, size=4, color='k')        
        n_labs = len(plot_df['antioxidant'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
        ax.set_xlabel('antioxidant', fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)
        
        # scale plot to omit outliers (>2.5*IQR from mean)
        if scale_outliers_box:
            grouped_strain = plot_df.groupby('antioxidant')
            y_bar = grouped_strain[FEATURE].median() # median is less skewed by outliers
            # Computing IQR
            Q1 = grouped_strain[FEATURE].quantile(0.25)
            Q3 = grouped_strain[FEATURE].quantile(0.75)
            IQR = Q3 - Q1
            plt.ylim(min(y_bar) - 1 * max(IQR), max(y_bar) + 2.5 * max(IQR))
            
        # annotate p-values
        for ii, antiox in enumerate(antiox_list):
            if antiox == control_antioxidant:
                continue
            # load antioxidant results for strain
            ttest_strain_path = stats_dir / 't-test_{}_antioxidant_results.csv'.format(strain)
            ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0, header=0)
            strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]] 
            strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns] 
            p = strain_pvals_t.loc[FEATURE, antiox]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == antiox
            p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            #plt.plot([ii-.2, ii-.2, ii+.2, ii+.2], [0.98, 0.99, 0.98, 0.99], lw=1.5, c='k', transform=trans)
            ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom', transform=trans)
                
        plt.title(strain, fontsize=18, pad=30)
        fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)
        
        
    # Hierarchical Clustering Analysis
    #   - Clustermap of features by strain, to see if data cluster into groups
    #   - Control data is clustered first, feature order is stored and ordering applied to 
    #     full data for comparison
    
    # subset for Tierpsy top16 features only
    features = select_feat_set(features, tierpsy_set_name='tierpsy_16', append_bluelight=False)
    
    # Ensure no NaNs or features with zero standard deviation before normalisation
    assert not features.isna().sum(axis=0).any()
    assert not (features.std(axis=0) == 0).any()
       
    # Extract data for control
    control_feat_df = features[metadata['gene_name']==control_strain]
    control_meta_df = metadata.reindex(control_feat_df.index)
    
    control_feat_df, control_meta_df = clean_summary_results(features=control_feat_df,
                                                             metadata=control_meta_df,
                                                             imputeNaN=False)
    

    #zscores = (df-df.mean())/df.std() # minus mean, divide by std
    controlZ_feat_df = control_feat_df.apply(zscore, axis=0)

    # plot clustermap for control        
    control_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format(control_strain)
    cg = plot_clustermap(featZ=controlZ_feat_df,
                         meta=control_meta_df,
                         row_colours=True,
                         group_by=['gene_name','antioxidant'],
                         col_linkage=None,
                         method='complete',#[linkage, complete, average, weighted, centroid]
                         figsize=(20,10),
                         show_xlabels=True,
                         label_size=15,
                         sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                         saveto=control_clustermap_path,
                         bluelight_col_colours=False)

    # extract clustered feature order
    clustered_features = np.array(controlZ_feat_df.columns)[cg.dendrogram_col.reordered_ind]
     
    featZ_df = features.apply(zscore, axis=0)
    
    # Save stats table to CSV   
    # if not stats_path.exists():
    #     # Add z-normalised values
    #     z_stats = featZ_df.join(meta_df[GROUPING_VAR]).groupby(by=GROUPING_VAR).mean().T
    #     z_mean_cols = ['z-mean ' + v for v in z_stats.columns.to_list()]
    #     z_stats.columns = z_mean_cols
    #     stats_table = stats_table.join(z_stats)
    #     first_cols = [m for m in stats_table.columns if 'mean' in m]
    #     last_cols = [c for c in stats_table.columns if c not in first_cols]
    #     first_cols.extend(last_cols)
    #     stats_table = stats_table[first_cols].reset_index()
    #     first_cols.insert(0, 'feature')
    #     stats_table.columns = first_cols
    #     stats_table['feature'] = [' '.join(f.split('_')) for f in stats_table['feature']]
    #     stats_table = stats_table.sort_values(by='{} p-value'.format((T_TEST_NAME if 
    #                                  len(run_strain_list) == 2 else TEST_NAME)), ascending=True)
    #     stats_table_path = stats_dir / 'stats_summary_table.csv'
    #     stats_table.to_csv(stats_table_path, header=True, index=None)
    
    # Clustermap of full data - antioxidants  
    full_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format('gene_antioxidant')
    _ = plot_clustermap(featZ=featZ_df,
                        meta=metadata, 
                        group_by=['gene_name','antioxidant'],
                        col_linkage=None,
                        method='complete',
                        figsize=(20,10),
                        show_xlabels=True,
                        label_size=15,
                        sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                        saveto=full_clustermap_path,
                        bluelight_col_colours=False)

    # Heatmap of strain/antioxidant treatment, ordered by control clustered feature order
    heatmap_date_path = plot_dir / 'heatmaps' / 'gene_antioxidant_heatmap.pdf'
    plot_barcode_heatmap(featZ=featZ_df[clustered_features], 
                         meta=metadata, 
                         group_by=['gene_name','antioxidant'], 
                         pvalues_series=None,
                         saveto=heatmap_date_path,
                         figsize=(20,6),
                         sns_colour_palette="Pastel1")    
      
    # Clustermap of full data - windows  
    full_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format('gene_window')
    _ = plot_clustermap(featZ=featZ_df,
                        meta=metadata, 
                        group_by=['gene_name','window'],
                        col_linkage=None,
                        method='complete',
                        figsize=(20,10),
                        show_xlabels=True,
                        label_size=15,
                        sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                        saveto=full_clustermap_path,
                        bluelight_col_colours=False)
                  
    # Principal Components Analysis (PCA)

    if remove_outliers:
        outlier_path = plot_dir / 'mahalanobis_outliers.pdf'
        features, inds = remove_outliers_pca(df=features, 
                                            features_to_analyse=None, 
                                            saveto=outlier_path)
        metadata = metadata.reindex(features.index)
        featZ_df = features.apply(zscore, axis=0)
  
    # project data + plot PCA
    #from tierpsytools.analysis.decomposition import plot_pca
    pca_dir = plot_dir / 'PCA'
    _ = plot_pca(featZ=featZ_df, 
                 meta=metadata, 
                 group_by='gene_name', 
                 n_dims=2,
                 control=control_strain,
                 var_subset=None, 
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 sns_colour_palette="Set1",
                 figsize=(12,8),
                 sub_adj={'bottom':0.1,'left':0.1,'top':0.95,'right':0.7},
                 legend_loc=[1.02,0.6],
                 hypercolor=False) 
         
    # t-distributed Stochastic Neighbour Embedding (tSNE)

    tsne_dir = plot_dir / 'tSNE'
    perplexities = [5,15,30] # NB: perplexity parameter should be roughly equal to group size
    
    _ = plot_tSNE(featZ=featZ_df,
                  meta=metadata,
                  group_by='gene_name',
                  var_subset=None,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=(8,8),
                  label_size=15,
                  size=20,
                  sns_colour_palette="Set1")
   
    # Uniform Manifold Projection (UMAP)

    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [5,15,30] # NB: n_neighbours parameter should be roughly equal to group size
    min_dist = 0.1 # Minimum distance parameter
    
    _ = plot_umap(featZ=featZ_df,
                  meta=metadata,
                  group_by='gene_name',
                  var_subset=None,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=(8,8),
                  label_size=15,
                  size=20,
                  sns_colour_palette="Set1")
    
    _ = plot_pca_2var(featZ=featZ_df, 
                      meta=metadata, 
                      var1='gene_name',
                      var2='antioxidant',
                      saveDir=pca_dir,
                      PCs_to_keep=10,
                      n_feats2print=10,
                      sns_colour_palette="Set1",
                      label_size=15,
                      figsize=[9,8],
                      sub_adj={'bottom':0,'left':0,'top':1,'right':1})

    return
Code example #6
0
    args = load_json(args.json)
    
    if FEATURES_PATH is None:
        FEATURES_PATH = Path(args.save_dir) / 'features.csv'
    if METADATA_PATH is None:
        METADATA_PATH = Path(args.save_dir) / 'metadata.csv'
        
    # Read clean feature summaries + metadata
    print("Loading metadata and feature summary results...")
    features = pd.read_csv(FEATURES_PATH)
    metadata = pd.read_csv(METADATA_PATH, dtype={'comments':str, 'source_plate_id':str})

    # Subset for desired imaging dates
    if args.dates is not None:
        assert type(args.dates) == list
        metadata = metadata.loc[metadata['date_yyyymmdd'].astype(str).isin(args.dates)]
        features = features.reindex(metadata.index)

    # Single feature only, or tierpsy feature set?
    if FEATURE is not None:
        features = features[[FEATURE]]
    else:
        # Load Tierpsy feature set + subset (columns) for selected features only
        features = select_feat_set(features, 'tierpsy_{}'.format(args.n_top_feats), append_bluelight=True)
        features = features[[f for f in features.columns if 'path_curvature' not in f]]

    compare_keio_rescue(features, metadata, args)
    
    toc = time()
    print("\nDone in %.1f seconds (%.1f minutes)" % (toc - tic, (toc - tic) / 60))