def _ttests_vs_control(X, y, control, test, stats_dir, label, args):
    """
    Test every group in `y` against `control` for each feature and save the results.

    Files written to `stats_dir`:
        - '<label>_uncorrected.csv' : statistics, effect sizes, raw p-values, reject flags
        - '<label>_results.csv'     : same table with multiple-test corrected p-values/flags
        - '<label>_sigfeats.txt'    : significant features (only if there are any)

    Inputs
    ------
    X : pd.DataFrame
        Feature values (rows = samples, columns = features)
    y : pd.Series
        Group label for each row of `X`
    control : object
        Label in `y` that every other group is compared against
    test : str
        Test name, passed to `univariate_tests` and `get_effect_sizes`
    stats_dir : pathlib.Path
        Directory to save results in (created if missing)
    label : str
        Filename prefix for the saved files
    args : Object
        Must provide `fdr_method` and `pval_threshold`

    Returns
    -------
    list
        Features rejected for at least one comparison after correction (in table order)
    """
    path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(label)
    path_corrected = stats_dir / '{}_results.csv'.format(label)
    path_corrected.parent.mkdir(exist_ok=True, parents=True)

    # perform tests (without correction for multiple testing)
    stats_t, pvals_t, reject_t = univariate_tests(X=X,
                                                  y=y,
                                                  control=control,
                                                  test=test,
                                                  comparison_type='binary_each_group',
                                                  multitest_correction=None,
                                                  alpha=0.05)
    # get effect sizes for comparisons
    effect_sizes_t = get_effect_sizes(X=X,
                                      y=y,
                                      control=control,
                                      effect_type=None,
                                      linked_test=test)

    # compile + save results (uncorrected)
    stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
    pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
    reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
    effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
    uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
    uncorrected.to_csv(path_uncorrected, header=True, index=True)

    # correct for multiple comparisons (strip the 'pvals_' prefix again first)
    # NOTE(review): split("_")[-1] also truncates group names that contain underscores
    pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
    reject_t, pvals_t = _multitest_correct(pvals_t,
                                           multitest_method=args.fdr_method,
                                           fdr=args.pval_threshold)

    # compile + save results (corrected)
    pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
    reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
    corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
    corrected.to_csv(path_corrected, header=True, index=True)

    # record significant features: rejected in any comparison (table order, no duplicates)
    any_rejected = np.asarray(reject_t, dtype=bool).any(axis=1)
    sig_feats = pvals_t.index[any_rejected].to_list()
    if sig_feats:
        write_list_to_file(sig_feats, stats_dir / '{}_sigfeats.txt'.format(label))

    return sig_feats


def dead_keio_stats(features, metadata, args):
    """
    Perform statistical analyses on dead Keio experiment results:
        - ANOVA for each feature across strains (only when there are more than 2 strains)
        - tests for each feature comparing each treatment vs CONTROL_TREATMENT, for each
          strain separately
        - tests for each feature comparing each strain vs CONTROL_STRAIN, separately for
          live bacteria and for dead bacteria

    All results are saved as CSV/text files under '<save_dir>/Stats/<fdr_method>/', where
    <save_dir> is whatever `get_save_dir(args)` returns. Nothing is returned.

    Inputs
    ------
    features, metadata : pd.DataFrame
        Clean feature summaries and accompanying metadata. `metadata` must contain the
        STRAIN_COLNAME and TREATMENT_COLNAME columns and a boolean 'dead' column

    args : Object
        Python object with (at least) the following attributes read here:
        - n_top_feats : int
        - pval_threshold : float
        - fdr_method : str
        Further attributes may be read by `get_save_dir(args)`
    """

    print("\nInvestigating variation in worm behaviour on dead vs alive hit Keio strains")

    # assert there will be no errors due to case-sensitivity
    assert len(metadata[STRAIN_COLNAME].unique()) == \
        len(metadata[STRAIN_COLNAME].str.upper().unique())
    assert all(isinstance(b, np.bool_) for b in metadata[TREATMENT_COLNAME].unique())

    # Load Tierpsy feature set + subset (columns) for selected features only
    features = select_feat_set(features, 'tierpsy_{}'.format(args.n_top_feats),
                               append_bluelight=True)
    features = features[[f for f in features.columns if 'path_curvature' not in f]]

    assert not features.isna().any().any()

    strain_list = list(metadata[STRAIN_COLNAME].unique())
    assert CONTROL_STRAIN in strain_list

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=[STRAIN_COLNAME, TREATMENT_COLNAME])
    print("Mean sample size of %s: %d" % (STRAIN_COLNAME,
                                          int(sample_size['n_samples'].mean())))

    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / "Stats" / args.fdr_method

    ##### ANOVA #####

    # make path to save ANOVA results
    test_path = stats_dir / 'ANOVA_results.csv'
    test_path.parent.mkdir(exist_ok=True, parents=True)

    # ANOVA across strains for significant feature differences
    if len(strain_list) > 2:
        stats, pvals, reject = univariate_tests(X=features,
                                                y=metadata[STRAIN_COLNAME],
                                                test='ANOVA',
                                                control=CONTROL_STRAIN,
                                                comparison_type='multiclass',
                                                multitest_correction=None,  # uncorrected
                                                alpha=args.pval_threshold,
                                                n_permutation_test=None)

        # get effect sizes
        effect_sizes = get_effect_sizes(X=features,
                                        y=metadata[STRAIN_COLNAME],
                                        control=CONTROL_STRAIN,
                                        effect_type=None,
                                        linked_test='ANOVA')

        # correct for multiple comparisons
        reject_corrected, pvals_corrected = _multitest_correct(
            pvals, multitest_method=args.fdr_method, fdr=args.pval_threshold)

        # compile + save results (corrected)
        test_results = pd.concat([stats, effect_sizes, pvals_corrected, reject_corrected],
                                 axis=1)
        test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
        test_results['significance'] = sig_asterix(test_results['pvals'])
        test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals
        test_results.to_csv(test_path, header=True, index=True)

        # percentage of features that are significant (guard against an empty table)
        nsig = test_results['reject'].sum()
        n_tested = len(test_results.index)
        pct_sig = (100 * nsig / n_tested) if n_tested > 0 else 0
        print("%d features (%.f%%) signficantly different among '%s'" %
              (nsig, pct_sig, STRAIN_COLNAME))

    ##### t-tests #####

    # NOTE(review): `t_test` is not defined inside this function; it is presumably a
    # module-level name holding the test to use - confirm.

    ### for each strain: compare each treatment with the control treatment
    for strain in strain_list:
        strain_meta = metadata[metadata[STRAIN_COLNAME] == strain]
        strain_feat = features.reindex(strain_meta.index)

        fset_ttest = _ttests_vs_control(X=strain_feat,
                                        y=strain_meta[TREATMENT_COLNAME],
                                        control=CONTROL_TREATMENT,
                                        test=t_test,
                                        stats_dir=stats_dir,
                                        label=t_test + '_' + strain,
                                        args=args)

        print("%d significant features for %s on any %s vs %s (%s, %s, P<%.2f)" %
              (len(fset_ttest), strain, TREATMENT_COLNAME, CONTROL_TREATMENT, t_test,
               args.fdr_method, args.pval_threshold))

    ### for LIVE and for DEAD bacteria separately: compare each strain with control
    # NOTE(review): the 'dead' column name is hard-coded here while the per-strain tests
    # use TREATMENT_COLNAME - presumably the same column; confirm.
    for state, is_dead in (('live', False), ('dead', True)):
        state_meta = metadata[metadata['dead'] == is_dead]
        state_feat = features.reindex(state_meta.index)

        fset_ttest = _ttests_vs_control(X=state_feat,
                                        y=state_meta[STRAIN_COLNAME],
                                        control=CONTROL_STRAIN,
                                        test=t_test,
                                        stats_dir=stats_dir,
                                        label=t_test + '_' + state,
                                        args=args)

        print("%s BACTERIA: %d significant features for any %s vs %s (%s, %s, P<%.2f)" %
              (state.upper(), len(fset_ttest), STRAIN_COLNAME, CONTROL_STRAIN, t_test,
               args.fdr_method, args.pval_threshold))
# Align bluelight conditions, drop rows without strain information, optionally subset the
# feature columns to a Tierpsy feature set, keep only the requested imaging dates, and
# finally drop rows with too many NaN values.

# align bluelight conditions (as separate feature columns)
features_df, metadata_df = align_bluelight_conditions(
    features_df, metadata_df, merge_on_cols=['date_yyyymmdd', 'imaging_plate_id', 'well_name'])

### clean data

# remove rows with missing strain information (n=10)
metadata_df = metadata_df[~metadata_df[args.strain_colname].isna()]
features_df = features_df.reindex(metadata_df.index)

# subset for Tierpsy features only
if args.n_features is not None:
    features_df = select_feat_set(features_df,
                                  tierpsy_set_name='tierpsy_{}'.format(args.n_features),
                                  append_bluelight=True)

# subset for given imaging dates (the date column is compared as strings, so the
# requested dates are converted to strings too)
imaging_dates = [str(d) for d in args.imaging_dates]
metadata_df = metadata_df[metadata_df['date_yyyymmdd'].astype(str).isin(imaging_dates)]
# re-align features to the date-filtered metadata (reindexing features on their own
# index would leave the other dates in)
features_df = features_df.reindex(metadata_df.index)

# use tierpsytools functions to clean feature data and remove NaNs
# Drop rows based on percentage of NaN values across features for each row
# NB: axis=1 will sum the NaNs across all the columns for each row
# NOTE(review): metadata_df is not re-aligned to features_df after this filter within the
# visible code - confirm it is done further down.
features_df = filter_nan_inf(features_df, threshold=0.8, axis=1, verbose=True)
# Load p-values from confirmational Keio screen pvals2 = pd.read_csv(KEIO_CONF_STATS_PATH, index_col=0) pvals2 = pvals2[[c for c in pvals2.columns if 'pval' in c]] pvals2.columns = [c.split('pvals_')[-1] for c in pvals2.columns] # assert index match assert set(pvals.index) == set(pvals2.index) # subset for shared columns only (strains present in both screens, ie. hit strains) shared = list(set(pvals.columns).intersection(set(pvals2.columns))) pvals, pvals2 = pvals[shared], pvals2[shared] # load Tierpsy top features (and expand for bluelight) pvals = select_feat_set(pvals.T, 'tierpsy_{}'.format(N_TOP_FEATS), append_bluelight=True).T feature_list = pvals.index.to_list() strain_list = pvals.columns.to_list() # fig, axs = strain_pval_pairplot(pvals, pvals2, strain_list=strain_list, # saveAs=Path(SAVE_DIR) / 'pairplot.pdf') # initial vs confirm screen - plot p-value corr of significant features for each strain errlog = strain_pval_plot(pvals, pvals2, strain_list=strain_list, saveDir=Path(SAVE_DIR) / 'p-value_strain_corr', figsize=(8, 8)) print(
# Keep only the train/test compounds, impute missing feature values, select the
# tierpsy_256 feature set and relabel the DMSO rows as 'DMSO_<fold>'.

# keep rows whose drug type is in the train or test compounds
# (explicit copy: rows of `meta` are assigned to further down)
meta = meta[meta['drug_type'].isin(train_compounds['drug_type'].to_list() +
                                   test_compounds['drug_type'].to_list())].copy()
feat = feat.loc[meta.index]

# Impute nans
if align_blue:
    # fill with the column means
    feat = feat.fillna(feat.mean())
else:
    # fill with the column means of the matching bluelight group
    means_cv = {blue: x.mean() for blue, x in feat.groupby(by=meta['bluelight'])}
    feat = [x.fillna(means_cv[blue]) for blue, x in feat.groupby(by=meta['bluelight'])]
    feat = pd.concat(feat).sort_index()

# Choose tierpsy256
feat = select_feat_set(feat, tierpsy_set_name='tierpsy_256', append_bluelight=align_blue)

#%% Get average doses
# Get the DMSO points
dmso_ids = meta['drug_type'] == 'DMSO'
# explicit copy, so that the .loc assignment below edits an independent frame instead of
# triggering pandas' chained-assignment (SettingWithCopy) warning
meta_dmso = meta[dmso_ids].copy()

# split the DMSO rows into 6 folds stratified by imaging date and rename the drug type
# of each fold to 'DMSO_<fold number>'
splitter = StratifiedKFold(n_splits=6)
for i, (_, idx) in enumerate(splitter.split(meta_dmso, meta_dmso['date_yyyymmdd'])):
    dfidx = meta_dmso.index[idx]
    meta_dmso.loc[dfidx, 'drug_type'] = 'DMSO_{}'.format(i)

# write the relabelled DMSO rows back
meta[dmso_ids] = meta_dmso

# Get average doses for remaining compounds
feat_cols = feat.columns
def _load_ttest_pvals(results_path):
    """Read a saved t-test results CSV and return its 'pvals_*' columns (prefix removed)."""
    results = pd.read_csv(results_path, index_col=0, header=0)
    pvals = results[[c for c in results if "pvals_" in c]]
    pvals.columns = [c.split('pvals_')[-1] for c in pvals.columns]
    return pvals


def _median_and_iqr(df, group_col):
    """Return (median, inter-quartile range) of FEATURE for each group in `group_col`."""
    grouped = df.groupby(group_col)
    y_bar = grouped[FEATURE].median()  # median is less skewed by outliers
    iqr = grouped[FEATURE].quantile(0.75) - grouped[FEATURE].quantile(0.25)
    return y_bar, iqr


def analyse_acute_rescue(features,
                         metadata,
                         save_dir,
                         control_strain,
                         control_antioxidant,
                         control_window,
                         fdr_method='fdr_by',
                         pval_threshold=0.05,
                         remove_outliers=False):
    """
    Plot the acute antioxidant rescue results. Previously saved t-test results are read
    from '<save_dir>/Stats/<fdr_method>/' to annotate p-values, and all figures are saved
    under '<save_dir>/Plots/<fdr_method>/':
        - boxplots of FEATURE for each hit strain vs control, by window
        - boxplots of FEATURE for each hit strain vs control, by antioxidant
        - boxplots of FEATURE by antioxidant, for each strain separately
        - clustermaps / heatmap, PCA, tSNE and UMAP of the tierpsy_16 feature set

    Uses the module-level names FEATURE, scale_outliers_box, WINDOW_FRAME_DICT and
    ALL_WINDOWS.

    Inputs
    ------
    features, metadata : pd.DataFrame
        Feature summaries and accompanying metadata. `metadata` must contain the columns
        'gene_name', 'antioxidant', 'window' and 'date_yyyymmdd'
    save_dir : str or Path
        Root directory containing the 'Stats' results and receiving the 'Plots'
    control_strain, control_antioxidant, control_window
        Control values of 'gene_name', 'antioxidant' and 'window' (placed first in plots)
    fdr_method : str
        Name of the sub-directory of 'Stats' / 'Plots' to use
    pval_threshold : float
        Not used by this function
    remove_outliers : bool
        If True, remove outliers with `remove_outliers_pca` before PCA/tSNE/UMAP

    Returns
    -------
    None
    """

    stats_dir = Path(save_dir) / "Stats" / fdr_method
    plot_dir = Path(save_dir) / "Plots" / fdr_method

    # categorical variables to investigate: 'gene_name', 'antioxidant' and 'window'
    # (control value first in each list)
    strain_list = [control_strain] + [s for s in metadata['gene_name'].unique()
                                      if s != control_strain]
    antiox_list = [control_antioxidant] + [a for a in metadata['antioxidant'].unique()
                                           if a != control_antioxidant]
    window_list = [control_window] + [w for w in metadata['window'].unique()
                                      if w != control_window]
    n_windows = len(window_list)

    print("\nInvestigating difference in fraction of worms paused between hit strain and control " +
          "(for each window), in the presence/absence of antioxidants:\n")

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=['gene_name', 'antioxidant', 'window'])
    print("Mean sample size of strain/antioxidant for each window: %d" %
          (int(sample_size['n_samples'].mean())))

    # plot dates as different colours (in loop)
    dates = list(metadata['date_yyyymmdd'].unique())
    date_lut = dict(zip(dates, sns.color_palette('Set1', n_colors=len(dates))))

    for strain in strain_list[1:]:  # skip control_strain at first index position
        plot_meta = metadata[metadata['gene_name'].isin([strain, control_strain])]
        plot_feat = features.reindex(plot_meta.index)
        plot_df = plot_meta.join(plot_feat[[FEATURE]])

        ### Is there a difference between strain vs control at any window?
        ### (pooled antioxidant data)
        print("Plotting windows for %s vs control" % strain)
        plt.close('all')
        fig, ax = plt.subplots(figsize=((n_windows if n_windows >= 20 else 12), 8))
        ax = sns.boxplot(x='window', y=FEATURE, hue='gene_name', hue_order=strain_list,
                         order=window_list, data=plot_df, palette='Set3', dodge=True, ax=ax)
        # overlay the data points, one colour per imaging date
        for date, colour in date_lut.items():
            date_df = plot_df[plot_df['date_yyyymmdd'] == date]
            ax = sns.stripplot(x='window', y=FEATURE, hue='gene_name', order=window_list,
                               hue_order=strain_list, data=date_df,
                               palette={control_strain: colour, strain: colour},
                               alpha=0.7, size=4, dodge=True, ax=ax)
        # keep only the first legend entries (the strip plots add further ones)
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False,
                  loc='upper right')

        # scale plot to omit outliers (upper limit = largest median + 3 * largest IQR)
        if scale_outliers_box:
            y_bar, IQR = _median_and_iqr(plot_df, 'window')
            plt.ylim(-0.02, max(y_bar) + 3 * max(IQR))

        # load t-test results (once per strain) + annotate p-values on plot
        window_pvals = _load_ttest_pvals(stats_dir / 'pairwise_ttests' / 'window' /
                                         '{}_window_results.csv'.format(strain))
        # x in data coordinates, y in axes coordinates
        trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
        for ii, window in enumerate(window_list):
            p = window_pvals.loc[FEATURE, str(window)]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == str(window)
            p_text = 'P<0.001' if p < 0.001 else 'P=%.3f' % p
            plt.plot([ii-.3, ii-.3, ii+.3, ii+.3], [0.98, 0.99, 0.99, 0.98],
                     lw=1.5, c='k', transform=trans)
            ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom', transform=trans,
                    rotation=(0 if n_windows <= 20 else 90))

        # relabel the windows with their start time in minutes; one tick per window, so
        # that the number of tick locations matches the number of labels
        ax.set_xticks(range(n_windows))
        xlabels = [str(int(WINDOW_FRAME_DICT[w][0]/60)) for w in window_list]
        ax.set_xticklabels(xlabels)
        x_text = 'Time (minutes)' if ALL_WINDOWS else 'Time of bluelight 10-second burst (minutes)'
        ax.set_xlabel(x_text, fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_', ' '), fontsize=15, labelpad=10)

        fig_savepath = plot_dir / 'window_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)

        ### Is there a difference between strain vs control for any antioxidant?
        ### (pooled window data)
        plt.close('all')
        fig, ax = plt.subplots(figsize=(10, 8))
        ax = sns.boxplot(x='antioxidant', y=FEATURE, hue='gene_name', hue_order=strain_list,
                         data=plot_df, palette='Set3', dodge=True, order=antiox_list, ax=ax)
        ax = sns.swarmplot(x='antioxidant', y=FEATURE, hue='gene_name',
                           hue_order=strain_list, data=plot_df, color='k', alpha=0.7,
                           size=4, dodge=True, order=antiox_list, ax=ax)
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False,
                  loc='upper right')
        ax.set_xlabel('antioxidant', fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_', ' '), fontsize=15, labelpad=10)

        # scale plot to omit outliers (limits = extreme medians -/+ 2.5 * largest IQR)
        if scale_outliers_box:
            y_bar, IQR = _median_and_iqr(plot_df, 'antioxidant')
            plt.ylim(min(y_bar) - 2.5 * max(IQR), max(y_bar) + 2.5 * max(IQR))

        # load t-test results (once per strain) + annotate p-values
        antiox_pvals = _load_ttest_pvals(stats_dir / 'pairwise_ttests' / 'antioxidant' /
                                         '{}_antioxidant_results.csv'.format(strain))
        trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
        for ii, antiox in enumerate(antiox_list):
            p = antiox_pvals.loc[FEATURE, antiox]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == antiox
            p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
            plt.plot([ii-.2, ii-.2, ii+.2, ii+.2], [0.8, 0.81, 0.81, 0.8],
                     lw=1.5, c='k', transform=trans)
            ax.text(ii, 0.82, p_text, fontsize=9, ha='center', va='bottom', transform=trans)

        # NOTE(review): the per-strain antioxidant plot further down is saved to this same
        # path, so for hit strains this figure gets overwritten - confirm which one to keep
        fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)

    ### Plot for each strain separately to see whether antioxidants had an effect at all
    # Built from the full data rather than from the frame left over from the loop above
    # (which only holds the last hit strain and the control), so every strain has data.
    all_df = metadata.join(features[[FEATURE]])
    treated_antiox = [(ii, antiox) for ii, antiox in enumerate(antiox_list)
                      if antiox != control_antioxidant]
    for strain in strain_list:
        strain_df = all_df[all_df['gene_name'] == strain]

        plt.close('all')
        fig, ax = plt.subplots(figsize=(10, 8))
        ax = sns.boxplot(x='antioxidant', y=FEATURE, order=antiox_list, dodge=True,
                         data=strain_df, ax=ax)
        ax = sns.swarmplot(x='antioxidant', y=FEATURE, order=antiox_list, dodge=True,
                           data=strain_df, alpha=0.7, size=4, color='k', ax=ax)
        n_labs = len(all_df['antioxidant'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False,
                  loc='upper right')
        ax.set_xlabel('antioxidant', fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_', ' '), fontsize=15, labelpad=10)

        # scale plot to omit outliers; limits are computed on the data pooled across
        # strains, so the y-axis is the same for every strain
        if scale_outliers_box:
            y_bar, IQR = _median_and_iqr(all_df, 'antioxidant')
            plt.ylim(min(y_bar) - 1 * max(IQR), max(y_bar) + 2.5 * max(IQR))

        # annotate p-values (each antioxidant vs control antioxidant, for this strain)
        if treated_antiox:
            strain_pvals = _load_ttest_pvals(
                stats_dir / 't-test_{}_antioxidant_results.csv'.format(strain))
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            for ii, antiox in treated_antiox:
                p = strain_pvals.loc[FEATURE, antiox]
                text = ax.get_xticklabels()[ii]
                assert text.get_text() == antiox
                p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
                ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom',
                        transform=trans)

        plt.title(strain, fontsize=18, pad=30)
        fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)

    ### Hierarchical Clustering Analysis
    #   - Clustermap of features by strain, to see if data cluster into groups
    #   - Control data is clustered first, feature order is stored and ordering applied to
    #     full data for comparison

    # subset for Tierpsy top16 features only
    features = select_feat_set(features, tierpsy_set_name='tierpsy_16',
                               append_bluelight=False)

    # Ensure no NaNs or features with zero standard deviation before normalisation
    assert not features.isna().sum(axis=0).any()
    assert not (features.std(axis=0) == 0).any()

    # Extract data for control
    control_feat_df = features[metadata['gene_name'] == control_strain]
    control_meta_df = metadata.reindex(control_feat_df.index)

    control_feat_df, control_meta_df = clean_summary_results(features=control_feat_df,
                                                             metadata=control_meta_df,
                                                             imputeNaN=False)

    # z-normalise each feature: (value - mean) / std
    controlZ_feat_df = control_feat_df.apply(zscore, axis=0)

    # plot clustermap for control
    control_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format(control_strain)
    cg = plot_clustermap(featZ=controlZ_feat_df,
                         meta=control_meta_df,
                         row_colours=True,
                         group_by=['gene_name', 'antioxidant'],
                         col_linkage=None,
                         method='complete',  # [linkage, complete, average, weighted, centroid]
                         figsize=(20, 10),
                         show_xlabels=True,
                         label_size=15,
                         sub_adj={'bottom': 0.6, 'left': 0, 'top': 1, 'right': 0.85},
                         saveto=control_clustermap_path,
                         bluelight_col_colours=False)

    # extract clustered feature order
    clustered_features = np.array(controlZ_feat_df.columns)[cg.dendrogram_col.reordered_ind]

    featZ_df = features.apply(zscore, axis=0)

    # Clustermap of full data - antioxidants
    full_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format('gene_antioxidant')
    _ = plot_clustermap(featZ=featZ_df,
                        meta=metadata,
                        group_by=['gene_name', 'antioxidant'],
                        col_linkage=None,
                        method='complete',
                        figsize=(20, 10),
                        show_xlabels=True,
                        label_size=15,
                        sub_adj={'bottom': 0.6, 'left': 0, 'top': 1, 'right': 0.85},
                        saveto=full_clustermap_path,
                        bluelight_col_colours=False)

    # Heatmap of strain/antioxidant treatment, ordered by control clustered feature order
    heatmap_date_path = plot_dir / 'heatmaps' / 'gene_antioxidant_heatmap.pdf'
    plot_barcode_heatmap(featZ=featZ_df[clustered_features],
                         meta=metadata,
                         group_by=['gene_name', 'antioxidant'],
                         pvalues_series=None,
                         saveto=heatmap_date_path,
                         figsize=(20, 6),
                         sns_colour_palette="Pastel1")

    # Clustermap of full data - windows
    full_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format('gene_window')
    _ = plot_clustermap(featZ=featZ_df,
                        meta=metadata,
                        group_by=['gene_name', 'window'],
                        col_linkage=None,
                        method='complete',
                        figsize=(20, 10),
                        show_xlabels=True,
                        label_size=15,
                        sub_adj={'bottom': 0.6, 'left': 0, 'top': 1, 'right': 0.85},
                        saveto=full_clustermap_path,
                        bluelight_col_colours=False)

    ### Principal Components Analysis (PCA)

    if remove_outliers:
        # drop outlier samples, then re-align metadata and re-normalise
        outlier_path = plot_dir / 'mahalanobis_outliers.pdf'
        features, inds = remove_outliers_pca(df=features,
                                             features_to_analyse=None,
                                             saveto=outlier_path)
        metadata = metadata.reindex(features.index)
        featZ_df = features.apply(zscore, axis=0)

    # project data + plot PCA
    pca_dir = plot_dir / 'PCA'
    _ = plot_pca(featZ=featZ_df,
                 meta=metadata,
                 group_by='gene_name',
                 n_dims=2,
                 control=control_strain,
                 var_subset=None,
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 sns_colour_palette="Set1",
                 figsize=(12, 8),
                 sub_adj={'bottom': 0.1, 'left': 0.1, 'top': 0.95, 'right': 0.7},
                 legend_loc=[1.02, 0.6],
                 hypercolor=False)

    ### t-distributed Stochastic Neighbour Embedding (tSNE)

    tsne_dir = plot_dir / 'tSNE'
    perplexities = [5, 15, 30]  # NB: perplexity should be roughly equal to group size

    _ = plot_tSNE(featZ=featZ_df,
                  meta=metadata,
                  group_by='gene_name',
                  var_subset=None,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=(8, 8),
                  label_size=15,
                  size=20,
                  sns_colour_palette="Set1")

    ### Uniform Manifold Projection (UMAP)

    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [5, 15, 30]  # NB: n_neighbours should be roughly equal to group size
    min_dist = 0.1  # Minimum distance parameter

    _ = plot_umap(featZ=featZ_df,
                  meta=metadata,
                  group_by='gene_name',
                  var_subset=None,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=(8, 8),
                  label_size=15,
                  size=20,
                  sns_colour_palette="Set1")

    # PCA coloured by two variables (strain and antioxidant)
    _ = plot_pca_2var(featZ=featZ_df,
                      meta=metadata,
                      var1='gene_name',
                      var2='antioxidant',
                      saveDir=pca_dir,
                      PCs_to_keep=10,
                      n_feats2print=10,
                      sns_colour_palette="Set1",
                      label_size=15,
                      figsize=[9, 8],
                      sub_adj={'bottom': 0, 'left': 0, 'top': 1, 'right': 1})

    return
# Load the analysis parameters, read the clean feature summaries + metadata, subset them
# for the requested imaging dates and features, then run the analysis and report the
# elapsed time (`tic` is set before this point).
args = load_json(args.json)

# default to the files in the save directory when no explicit paths are given
if FEATURES_PATH is None:
    FEATURES_PATH = Path(args.save_dir) / 'features.csv'
if METADATA_PATH is None:
    METADATA_PATH = Path(args.save_dir) / 'metadata.csv'

# Read clean feature summaries + metadata
print("Loading metadata and feature summary results...")
features = pd.read_csv(FEATURES_PATH)
metadata = pd.read_csv(METADATA_PATH, dtype={'comments': str, 'source_plate_id': str})

# Subset for desired imaging dates
if args.dates is not None:
    assert isinstance(args.dates, list)
    # the date column is compared as strings, so convert the requested dates too
    # (otherwise dates given as numbers would never match)
    dates = [str(d) for d in args.dates]
    metadata = metadata.loc[metadata['date_yyyymmdd'].astype(str).isin(dates)]
    features = features.reindex(metadata.index)

# Single feature only, or tierpsy feature set?
if FEATURE is not None:
    features = features[[FEATURE]]
else:
    # Load Tierpsy feature set + subset (columns) for selected features only
    features = select_feat_set(features, 'tierpsy_{}'.format(args.n_top_feats),
                               append_bluelight=True)
    features = features[[f for f in features.columns if 'path_curvature' not in f]]

compare_keio_rescue(features, metadata, args)

toc = time()
print("\nDone in %.1f seconds (%.1f minutes)" % (toc - tic, (toc - tic) / 60))