Example #1
                     Path(args.results_dir).parent / "Analysis" / "Superplots")

    combined_feats_path = Path(args.results_dir) / "full_features.csv"
    combined_fnames_path = Path(args.results_dir) / "full_filenames.csv"
    
    # NB: leaves the df in a "long" format that seaborn works well with
    features, metadata = read_hydra_metadata(feat_file=combined_feats_path,
                                             fname_file=combined_fnames_path,
                                             meta_file=args.compiled_metadata_path,
                                             add_bluelight=True)
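    # NB: with add_bluelight=True, the bluelight stimulus condition is parsed
    # from each filename into the metadata (e.g. 'prestim'/'bluelight'/'poststim'),
    # keeping features and metadata row-aligned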

    # Fix metadata column dtypes (e.g. strings as factors, not floats) and
    # replace the unicode delta ('Δ') in food_type names
    metadata = fix_dtypes(metadata)
    metadata['food_type'] = [f.replace("Δ","_") for f in metadata['food_type']]
    
    features, metadata = clean_summary_results(features, metadata)
        
    # Load feature list from file
    if args.feature_list_from_csv is not None:
        assert Path(args.feature_list_from_csv).exists()
        
        feature_list = pd.read_csv(args.feature_list_from_csv)
        feature_list = list(feature_list[feature_list.columns[0]].unique())
    elif args.n_top_feats is not None:
        top_feats_path = Path(args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(str(args.n_top_feats))        
        topfeats = load_topfeats(top_feats_path, add_bluelight=True, 
                                 remove_path_curvature=True, header=None)

        # Drop features that are not in results
        feature_list = [feat for feat in list(topfeats) if feat in features.columns]
        features = features[feature_list]
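
# NB: fix_dtypes above is project-specific; a minimal sketch of the kind of
# coercion it presumably performs (assumed behaviour, not the actual
# implementation):

import pandas as pd

def fix_dtypes_sketch(metadata: pd.DataFrame) -> pd.DataFrame:
    # cast mixed-type identifier columns to string so joins/groupbys behave
    # consistently (hypothetical column names)
    for col in ['food_type', 'gene_name', 'source_plate_id', 'comments']:
        if col in metadata.columns:
            metadata[col] = metadata[col].astype(str)
    return metadata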
Example #2
     # compile window summaries
     features, metadata = process_feature_summaries(metadata_path=metadata_path, 
                                                    results_dir=RES_DIR, 
                                                    compile_day_summaries=True,
                                                    imaging_dates=IMAGING_DATES, 
                                                    align_bluelight=False,
                                                    window_summaries=True,
                                                    n_wells=N_WELLS)
     
     # clean results
     features, metadata = clean_summary_results(features, 
                                                metadata,
                                                feature_columns=None,
                                                nan_threshold_row=NAN_THRESHOLD_ROW,
                                                nan_threshold_col=NAN_THRESHOLD_COL,
                                                max_value_cap=1e15,
                                                imputeNaN=True,
                                                min_nskel_per_video=MIN_NSKEL_PER_VIDEO,
                                                min_nskel_sum=MIN_NSKEL_SUM,
                                                drop_size_related_feats=False,
                                                norm_feats_only=False,
                                                percentile_to_use=None)
 
     assert not features.isna().sum(axis=0).any()
     assert not (features.std(axis=0) == 0).any() # no features with zero std
 
     # save features
     metadata.to_csv(META_PATH, index=False)
     features.to_csv(FEAT_PATH, index=False) 
 
 else:
     # load clean metadata and features
Example #3
                                               imaging_dates=args.dates, 
                                               add_well_annotations=args.add_well_annotations)
    
    # Process feature summary results
    features, metadata = process_feature_summaries(metadata_path, 
                                                   RESULTS_DIR,
                                                   compile_day_summaries=args.compile_day_summaries,
                                                   imaging_dates=args.dates,
                                                   align_bluelight=args.align_bluelight)
    
    # Clean: remove data with too many NaNs/zero std and impute remaining NaNs
    features, metadata = clean_summary_results(features, 
                                               metadata,
                                               feature_columns=None,
                                               imputeNaN=args.impute_nans,
                                               nan_threshold=args.nan_threshold,
                                               max_value_cap=args.max_value_cap,
                                               drop_size_related_feats=args.drop_size_features,
                                               norm_feats_only=args.norm_features_only,
                                               percentile_to_use=args.percentile_to_use)
    
    # Load supplementary info + append to metadata
    if 'COG category' not in metadata.columns:
        supplementary_7 = load_supplementary_7(args.path_sup_info)
        metadata = append_supplementary_7(metadata, supplementary_7)
        
    # # Calculate duration on food + duration in L1 diapause
    # metadata = duration_on_food(metadata) 
    # metadata = duration_L1_diapause(metadata)

    #%% Subset results
Example #4
        for s in metadata['gene_name']
    ]
    #['BW\u0394'+g if not g == 'BW' else 'wild_type' for g in metadata['gene_name']]

    # Create is_bad_well column - refer to manual metadata for bad 35mm petri plates
    metadata['is_bad_well'] = False

    # Clean results - Remove bad well data + features with too many NaNs/zero std
    #                                      + impute remaining NaNs
    features, metadata = clean_summary_results(
        features,
        metadata,
        feature_columns=None,
        nan_threshold_row=args.nan_threshold_row,
        nan_threshold_col=args.nan_threshold_col,
        max_value_cap=args.max_value_cap,
        imputeNaN=args.impute_nans,
        min_nskel_per_video=args.min_nskel_per_video,
        min_nskel_sum=args.min_nskel_sum,
        drop_size_related_feats=args.drop_size_features,
        norm_feats_only=args.norm_features_only,
        percentile_to_use=args.percentile_to_use)

    assert not features.isna().sum(axis=0).any()
    assert not (features.std(axis=0) == 0).any() # no features with zero std

    if ALL_WINDOWS:
        WINDOW_LIST = list(WINDOW_FRAME_DICT.keys())
        args.save_dir = Path(args.save_dir) / 'all_windows'

    perform_fast_effect_stats(features, metadata, WINDOW_LIST, args)
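
# NB: perform_fast_effect_stats is project-specific. A minimal sketch of
# per-window pairwise testing with FDR correction, assuming this is roughly
# what it computes (hypothetical helper, not the actual implementation):

import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

def pairwise_window_ttests_sketch(features, metadata, feature, control='wild_type'):
    # t-test each strain vs control within each window, then apply
    # Benjamini-Yekutieli FDR correction across all comparisons
    pvals = {}
    for window in metadata['window'].unique():
        in_window = metadata['window'] == window
        control_vals = features.loc[in_window & (metadata['gene_name'] == control), feature]
        for strain in metadata.loc[in_window, 'gene_name'].unique():
            if strain == control:
                continue
            strain_vals = features.loc[in_window & (metadata['gene_name'] == strain), feature]
            pvals[(strain, window)] = ttest_ind(control_vals, strain_vals)[1]
    pvals = pd.Series(pvals)
    pvals[:] = multipletests(pvals.values, method='fdr_by')[1]
    return pvals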
Example #5
# NB: assumed imports for this snippet; project-specific helpers
# (clean_summary_results, select_feat_set, plot_clustermap, plot_pca, plot_tSNE,
# plot_umap, plot_barcode_heatmap, remove_outliers_pca, df_summary_stats, etc.)
# come from the analysis repo's own modules / tierpsytools
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt, transforms
from scipy.stats import zscore

def analyse_acute_rescue(features, 
                         metadata,
                         save_dir,
                         control_strain, 
                         control_antioxidant, 
                         control_window,
                         fdr_method='fdr_by',
                         pval_threshold=0.05,
                         remove_outliers=False):
 
    stats_dir = Path(save_dir) / "Stats" / fdr_method
    plot_dir = Path(save_dir) / "Plots" / fdr_method

    strain_list = [control_strain] + [s for s in metadata['gene_name'].unique() if s != control_strain]  
    antiox_list = [control_antioxidant] + [a for a in metadata['antioxidant'].unique() if 
                                           a != control_antioxidant]
    window_list = [control_window] + [w for w in metadata['window'].unique() if w != control_window]

    # categorical variables to investigate: 'gene_name', 'antioxidant' and 'window'
    print("\nInvestigating difference in fraction of worms paused between hit strain and control " +
          "(for each window), in the presence/absence of antioxidants:\n")    

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=['gene_name', 'antioxidant', 'window'])
    print("Mean sample size of strain/antioxidant for each window: %d" %\
          (int(sample_size['n_samples'].mean())))
            
    # plot dates as different colours (in loop)
    date_lut = dict(zip(list(metadata['date_yyyymmdd'].unique()), 
                        sns.color_palette('Set1', n_colors=len(metadata['date_yyyymmdd'].unique()))))
        
    for strain in strain_list[1:]: # skip control_strain at first index position
        plot_meta = metadata[np.logical_or(metadata['gene_name']==strain, 
                                           metadata['gene_name']==control_strain)]
        plot_feat = features.reindex(plot_meta.index)
        plot_df = plot_meta.join(plot_feat[[FEATURE]])
        
        # Is there a difference between strain vs control at any window? (pooled antioxidant data)
        print("Plotting windows for %s vs control" % strain)
        plt.close('all')
        fig, ax = plt.subplots(figsize=((len(window_list) if len(window_list) >= 20 else 12),8))
        ax = sns.boxplot(x='window', y=FEATURE, hue='gene_name', hue_order=strain_list, order=window_list,
                         data=plot_df, palette='Set3', dodge=True, ax=ax)
        for date in date_lut.keys():
            date_df = plot_df[plot_df['date_yyyymmdd']==date]   
            ax = sns.stripplot(x='window', y=FEATURE, hue='gene_name', order=window_list,
                               hue_order=strain_list, data=date_df, 
                               palette={control_strain:date_lut[date], strain:date_lut[date]}, 
                               alpha=0.7, size=4, dodge=True, ax=ax)
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
                
        # scale y-axis to omit outliers (more than 3*IQR above the median)
        if scale_outliers_box:
            grouped_strain = plot_df.groupby('window')
            y_bar = grouped_strain[FEATURE].median() # median is less skewed by outliers
            # Computing IQR
            Q1 = grouped_strain[FEATURE].quantile(0.25)
            Q3 = grouped_strain[FEATURE].quantile(0.75)
            IQR = Q3 - Q1
            plt.ylim(-0.02, max(y_bar) + 3 * max(IQR))
            
        # load t-test results + annotate p-values on plot
        for ii, window in enumerate(window_list):
            ttest_strain_path = stats_dir / 'pairwise_ttests' / 'window' /\
                                '{}_window_results.csv'.format(strain)
            ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0, header=0)
            strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]] 
            strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns] 
            p = strain_pvals_t.loc[FEATURE, str(window)]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == str(window)
            p_text = 'P<0.001' if p < 0.001 else 'P=%.3f' % p
            #y = (y_bar[antiox] + 2 * IQR[antiox]) if scale_outliers_box else plot_df[feature].max()
            #h = (max(IQR) / 10) if scale_outliers_box else (y - plot_df[feature].min()) / 50
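            # blended transform: x in data coordinates (box positions), y in
            # axes coordinates (0-1), so annotations sit at a fixed height above the boxes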
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            plt.plot([ii-.3, ii-.3, ii+.3, ii+.3], 
                     [0.98, 0.99, 0.99, 0.98], #[y+h, y+2*h, y+2*h, y+h], 
                     lw=1.5, c='k', transform=trans)
            ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom', transform=trans,
                    rotation=(0 if len(window_list) <= 20 else 90))
            
        ax.set_xticks(range(len(window_list)))
        xlabels = [str(int(WINDOW_FRAME_DICT[w][0]/60)) for w in window_list]
        ax.set_xticklabels(xlabels)
        x_text = 'Time (minutes)' if ALL_WINDOWS else 'Time of bluelight 10-second burst (minutes)'
        ax.set_xlabel(x_text, fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)
        
        fig_savepath = plot_dir / 'window_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)
    
    
        # Is there a difference between strain vs control for any antioxidant? (pooled window data)
        plt.close('all')
        fig, ax = plt.subplots(figsize=(10,8))
        ax = sns.boxplot(x='antioxidant', y=FEATURE, hue='gene_name', hue_order=strain_list, data=plot_df,
                          palette='Set3', dodge=True, order=antiox_list)
        ax = sns.swarmplot(x='antioxidant', y=FEATURE, hue='gene_name', hue_order=strain_list, data=plot_df,
                          color='k', alpha=0.7, size=4, dodge=True, order=antiox_list)
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
        ax.set_xlabel('antioxidant', fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)
        
        # scale y-axis to omit outliers (>2.5*IQR from the median)
        if scale_outliers_box:
            grouped_strain = plot_df.groupby('antioxidant')
            y_bar = grouped_strain[FEATURE].median() # median is less skewed by outliers
            # Computing IQR
            Q1 = grouped_strain[FEATURE].quantile(0.25)
            Q3 = grouped_strain[FEATURE].quantile(0.75)
            IQR = Q3 - Q1
            plt.ylim(min(y_bar) - 2.5 * max(IQR), max(y_bar) + 2.5 * max(IQR))
            
        # annotate p-values
        for ii, antiox in enumerate(antiox_list):
            ttest_strain_path = stats_dir / 'pairwise_ttests' / 'antioxidant' /\
                                '{}_antioxidant_results.csv'.format(strain)
            ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0, header=0)
            strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]] 
            strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns] 
            p = strain_pvals_t.loc[FEATURE, antiox]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == antiox
            p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
            #y = (y_bar[antiox] + 2 * IQR[antiox]) if scale_outliers_box else plot_df[feature].max()
            #h = (max(IQR) / 10) if scale_outliers_box else (y - plot_df[feature].min()) / 50
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            plt.plot([ii-.2, ii-.2, ii+.2, ii+.2], 
                      [0.8, 0.81, 0.81, 0.8], #[y+h, y+2*h, y+2*h, y+h], 
                      lw=1.5, c='k', transform=trans)
            ax.text(ii, 0.82, p_text, fontsize=9, ha='center', va='bottom', transform=trans)
                
        fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)
        
    # Plot for each strain separately to see whether antioxidants had an effect at all
    for strain in strain_list:
        # NB: re-subset the data for this strain (plot_df from the loop above
        # contains only the last hit strain + control)
        strain_meta = metadata[metadata['gene_name']==strain]
        strain_df = strain_meta.join(features.reindex(strain_meta.index)[[FEATURE]])

        plt.close('all')
        fig, ax = plt.subplots(figsize=(10,8))
        ax = sns.boxplot(x='antioxidant', y=FEATURE, order=antiox_list, 
                         dodge=True, data=strain_df)
        ax = sns.swarmplot(x='antioxidant', y=FEATURE, order=antiox_list, 
                           dodge=True, data=strain_df,
                           alpha=0.7, size=4, color='k')        
        n_labs = len(strain_df['antioxidant'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
        ax.set_xlabel('antioxidant', fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)
        
        # scale y-axis to omit outliers (>2.5*IQR beyond the median)
        if scale_outliers_box:
            grouped_strain = strain_df.groupby('antioxidant')
            y_bar = grouped_strain[FEATURE].median() # median is less skewed by outliers
            # Computing IQR
            Q1 = grouped_strain[FEATURE].quantile(0.25)
            Q3 = grouped_strain[FEATURE].quantile(0.75)
            IQR = Q3 - Q1
            plt.ylim(min(y_bar) - 1 * max(IQR), max(y_bar) + 2.5 * max(IQR))
            
        # annotate p-values
        for ii, antiox in enumerate(antiox_list):
            if antiox == control_antioxidant:
                continue
            # load antioxidant results for strain
            ttest_strain_path = stats_dir / 't-test_{}_antioxidant_results.csv'.format(strain)
            ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0, header=0)
            strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]] 
            strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns] 
            p = strain_pvals_t.loc[FEATURE, antiox]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == antiox
            p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            #plt.plot([ii-.2, ii-.2, ii+.2, ii+.2], [0.98, 0.99, 0.98, 0.99], lw=1.5, c='k', transform=trans)
            ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom', transform=trans)
                
        plt.title(strain, fontsize=18, pad=30)
        fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)
        
        
    # Hierarchical Clustering Analysis
    #   - Clustermap of features by strain, to see if data cluster into groups
    #   - Control data is clustered first, feature order is stored and ordering applied to 
    #     full data for comparison
    
    # subset for Tierpsy top16 features only
    features = select_feat_set(features, tierpsy_set_name='tierpsy_16', append_bluelight=False)
    
    # Ensure no NaNs or features with zero standard deviation before normalisation
    assert not features.isna().sum(axis=0).any()
    assert not (features.std(axis=0) == 0).any()
       
    # Extract data for control
    control_feat_df = features[metadata['gene_name']==control_strain]
    control_meta_df = metadata.reindex(control_feat_df.index)
    
    control_feat_df, control_meta_df = clean_summary_results(features=control_feat_df,
                                                             metadata=control_meta_df,
                                                             imputeNaN=False)
    

    #zscores = (df-df.mean())/df.std() # minus mean, divide by std
    controlZ_feat_df = control_feat_df.apply(zscore, axis=0)

    # plot clustermap for control        
    control_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format(control_strain)
    cg = plot_clustermap(featZ=controlZ_feat_df,
                         meta=control_meta_df,
                         row_colours=True,
                         group_by=['gene_name','antioxidant'],
                         col_linkage=None,
                         method='complete', # linkage: [single, complete, average, weighted, centroid]
                         figsize=(20,10),
                         show_xlabels=True,
                         label_size=15,
                         sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                         saveto=control_clustermap_path,
                         bluelight_col_colours=False)

    # extract clustered feature order
    clustered_features = np.array(controlZ_feat_df.columns)[cg.dendrogram_col.reordered_ind]
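    # (cg.dendrogram_col.reordered_ind gives the column order after hierarchical
    #  clustering; this ordering is re-applied to the full dataset below)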
     
    featZ_df = features.apply(zscore, axis=0)
    
    # Save stats table to CSV   
    # if not stats_path.exists():
    #     # Add z-normalised values
    #     z_stats = featZ_df.join(meta_df[GROUPING_VAR]).groupby(by=GROUPING_VAR).mean().T
    #     z_mean_cols = ['z-mean ' + v for v in z_stats.columns.to_list()]
    #     z_stats.columns = z_mean_cols
    #     stats_table = stats_table.join(z_stats)
    #     first_cols = [m for m in stats_table.columns if 'mean' in m]
    #     last_cols = [c for c in stats_table.columns if c not in first_cols]
    #     first_cols.extend(last_cols)
    #     stats_table = stats_table[first_cols].reset_index()
    #     first_cols.insert(0, 'feature')
    #     stats_table.columns = first_cols
    #     stats_table['feature'] = [' '.join(f.split('_')) for f in stats_table['feature']]
    #     stats_table = stats_table.sort_values(by='{} p-value'.format((T_TEST_NAME if 
    #                                  len(run_strain_list) == 2 else TEST_NAME)), ascending=True)
    #     stats_table_path = stats_dir / 'stats_summary_table.csv'
    #     stats_table.to_csv(stats_table_path, header=True, index=None)
    
    # Clustermap of full data - antioxidants  
    full_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format('gene_antioxidant')
    _ = plot_clustermap(featZ=featZ_df,
                        meta=metadata, 
                        group_by=['gene_name','antioxidant'],
                        col_linkage=None,
                        method='complete',
                        figsize=(20,10),
                        show_xlabels=True,
                        label_size=15,
                        sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                        saveto=full_clustermap_path,
                        bluelight_col_colours=False)

    # Heatmap of strain/antioxidant treatment, ordered by control clustered feature order
    heatmap_date_path = plot_dir / 'heatmaps' / 'gene_antioxidant_heatmap.pdf'
    plot_barcode_heatmap(featZ=featZ_df[clustered_features], 
                         meta=metadata, 
                         group_by=['gene_name','antioxidant'], 
                         pvalues_series=None,
                         saveto=heatmap_date_path,
                         figsize=(20,6),
                         sns_colour_palette="Pastel1")    
      
    # Clustermap of full data - windows  
    full_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format('gene_window')
    _ = plot_clustermap(featZ=featZ_df,
                        meta=metadata, 
                        group_by=['gene_name','window'],
                        col_linkage=None,
                        method='complete',
                        figsize=(20,10),
                        show_xlabels=True,
                        label_size=15,
                        sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                        saveto=full_clustermap_path,
                        bluelight_col_colours=False)
                  
    # Principal Components Analysis (PCA)

    if remove_outliers:
        outlier_path = plot_dir / 'mahalanobis_outliers.pdf'
        features, inds = remove_outliers_pca(df=features, 
                                            features_to_analyse=None, 
                                            saveto=outlier_path)
        metadata = metadata.reindex(features.index)
        featZ_df = features.apply(zscore, axis=0)
  
    # project data + plot PCA
    #from tierpsytools.analysis.decomposition import plot_pca
    pca_dir = plot_dir / 'PCA'
    _ = plot_pca(featZ=featZ_df, 
                 meta=metadata, 
                 group_by='gene_name', 
                 n_dims=2,
                 control=control_strain,
                 var_subset=None, 
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 sns_colour_palette="Set1",
                 figsize=(12,8),
                 sub_adj={'bottom':0.1,'left':0.1,'top':0.95,'right':0.7},
                 legend_loc=[1.02,0.6],
                 hypercolor=False) 
         
    # t-distributed Stochastic Neighbour Embedding (tSNE)

    tsne_dir = plot_dir / 'tSNE'
    perplexities = [5,15,30] # NB: perplexity parameter should be roughly equal to group size
    
    _ = plot_tSNE(featZ=featZ_df,
                  meta=metadata,
                  group_by='gene_name',
                  var_subset=None,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=(8,8),
                  label_size=15,
                  size=20,
                  sns_colour_palette="Set1")
   
    # Uniform Manifold Projection (UMAP)

    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [5,15,30] # NB: n_neighbours parameter should be roughly equal to group size
    min_dist = 0.1 # Minimum distance parameter
    
    _ = plot_umap(featZ=featZ_df,
                  meta=metadata,
                  group_by='gene_name',
                  var_subset=None,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=(8,8),
                  label_size=15,
                  size=20,
                  sns_colour_palette="Set1")
    
    _ = plot_pca_2var(featZ=featZ_df, 
                      meta=metadata, 
                      var1='gene_name',
                      var2='antioxidant',
                      saveDir=pca_dir,
                      PCs_to_keep=10,
                      n_feats2print=10,
                      sns_colour_palette="Set1",
                      label_size=15,
                      figsize=[9,8],
                      sub_adj={'bottom':0,'left':0,'top':1,'right':1})

    return
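
# Example (hypothetical) invocation, assuming cleaned `features`/`metadata`:
# analyse_acute_rescue(features, metadata,
#                      save_dir='./acute_rescue_analysis',
#                      control_strain='wild_type',
#                      control_antioxidant='None',
#                      control_window=0,
#                      fdr_method='fdr_by',
#                      pval_threshold=0.05)
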
def compare_strains_keio(features, metadata, args):
    """ Compare Keio single-gene deletion mutants with wild-type BW25113 control and look to see if 
        they signfiicantly alter N2 C. elegans behaviour while feeding.
        
        Subset results to omit selected strains (optional) 
        Inputs
        ------
        features, metadata : pd.DataFrame
            Matching features summaries and metadata
        
        args : Object 
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - omit_strains : list
            - grouping_variable : str
            - control_dict : dict
            - collapse_control : bool
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int
    """

    assert set(features.index) == set(metadata.index)

    # categorical variable to investigate, e.g. 'gene_name'
    grouping_var = args.grouping_variable
    n_strains = len(metadata[grouping_var].unique())
    assert n_strains == len(
        metadata[grouping_var].str.upper().unique())  # check case-sensitivity
    print("\nInvestigating '%s' variation (%d samples)" %
          (grouping_var, n_strains))

    # Subset results (rows) to omit selected strains
    if args.omit_strains is not None:
        features, metadata = subset_results(features,
                                            metadata,
                                            column=grouping_var,
                                            groups=args.omit_strains,
                                            omit=True)

    control = args.control_dict[grouping_var]  # control strain to use

    # Load Tierpsy Top feature set + subset (columns) for top feats only
    if args.n_top_feats is not None:
        top_feats_path = Path(
            args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(
                str(args.n_top_feats))
        topfeats = load_topfeats(top_feats_path,
                                 add_bluelight=True,
                                 remove_path_curvature=True,
                                 header=None)

        # Drop features that are not in results
        top_feats_list = [
            feat for feat in list(topfeats) if feat in features.columns
        ]
        features = features[top_feats_list]

    ##### Control variation #####

    control_metadata = metadata[metadata[grouping_var] == control]
    control_features = features.reindex(control_metadata.index)

    # Clean data after subset - to remove features with zero std
    control_feat_clean, control_meta_clean = clean_summary_results(
        control_features,
        control_metadata,
        max_value_cap=False,
        imputeNaN=False)
    if args.analyse_control:
        control_variation(control_feat_clean,
                          control_meta_clean,
                          args,
                          variables=[
                              k for k in args.control_dict.keys()
                              if k != grouping_var
                          ],
                          n_sig_features=10)

    if args.collapse_control:
        print("\nCollapsing control data (mean of each day)")
        features, metadata = average_plate_control_data(features, metadata)

    # Record mean sample size per group
    mean_sample_size = int(
        np.round(
            metadata.join(features).groupby([grouping_var],
                                            as_index=False).size().mean()))
    print("Mean sample size: %d" % mean_sample_size)

    save_dir = get_save_dir(args)
    stats_dir = save_dir / grouping_var / "Stats" / args.fdr_method
    plot_dir = save_dir / grouping_var / "Plots" / args.fdr_method

    ##### STATISTICS #####

    # =============================================================================
    #     ##### Pairplot Tierpsy Features - Pairwise correlation matrix #####
    #     if args.n_top_feats == 16:
    #         g = sns.pairplot(features, height=1.5)
    #         for ax in g.axes.flatten():
    #             # rotate x and y axis labels
    #             ax.set_xlabel(ax.get_xlabel(), rotation = 90)
    #             ax.set_ylabel(ax.get_ylabel(), rotation = 0)
    #         plt.subplots_adjust(left=0.3, bottom=0.3)
    #         plt.show()
    # =============================================================================

    if not args.use_corrected_pvals:
        anova_path = stats_dir / '{}_results_uncorrected.csv'.format(args.test)
    else:
        anova_path = stats_dir / '{}_results.csv'.format(args.test)

    # load results + record significant features
    print("\nLoading statistics results")
    anova_table = pd.read_csv(anova_path, index_col=0)
    pvals = anova_table.sort_values(
        by='pvals', ascending=True)['pvals']  # rank features by p-value
    fset = pvals[pvals < args.pval_threshold].index.to_list()
    print(
        "\n%d significant features found by %s (P<%.2f, %s)" %
        (len(fset), args.test, args.pval_threshold,
         ('uncorrected' if not args.use_corrected_pvals else args.fdr_method)))

    ### k-significant features

    if len(fset) > 0:
        # Compare k sigfeat and ANOVA significant feature set overlap
        if not args.use_corrected_pvals:
            k_sigfeats_path = stats_dir / "k_significant_features_uncorrected.csv"
        else:
            k_sigfeats_path = stats_dir / "k_significant_features.csv"

        ksig_table = pd.read_csv(k_sigfeats_path, index_col=0)
        fset_ksig = ksig_table[
            ksig_table['pvals'] < args.pval_threshold].index.to_list()

        fset_overlap = set(fset).intersection(set(fset_ksig))
        prop_overlap = len(fset_overlap) / len(fset)
        print("%.1f%% overlap with k-significant features" %
              (prop_overlap * 100))

        if prop_overlap < 0.5 and len(fset) > 100:
            print(
                "WARNING: Inconsistency in statistics for feature set agreement between "
                + "%s and k significant features!" % args.test)

        if args.use_k_sig_feats_overlap:
            fset = list(ksig_table.loc[fset_overlap].sort_values(
                by='pvals', ascending=True).index)

        ### t-test

        t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum

        if not args.use_corrected_pvals:
            ttest_path = stats_dir / '{}_results_uncorrected.csv'.format(
                t_test)
        else:
            ttest_path = stats_dir / '{}_results.csv'.format(t_test)

        # read t-test results + record significant features (NOT ORDERED)
        ttest_table = pd.read_csv(ttest_path, index_col=0)
        pvals_t = ttest_table[[c for c in ttest_table if "pvals_" in c]]
        pvals_t.columns = [c.split('pvals_')[-1] for c in pvals_t.columns]
        fset_ttest = pvals_t[(pvals_t < args.pval_threshold).sum(
            axis=1) > 0].index.to_list()
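        # i.e. features significant (P < threshold) for at least one strain vs control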
        print("%d significant features found by %s (P<%.2f, %s)" %
              (len(fset_ttest), t_test, args.pval_threshold,
               ('uncorrected'
                if not args.use_corrected_pvals else args.fdr_method)))

    else:
        print("No significant features found for %s by %s" %
              (grouping_var, args.test))

    ##### PLOTTING #####

    if len(fset) > 0:
        # Rank strains by number of sigfeats by t-test
        ranked_nsig = (pvals_t < args.pval_threshold).sum(axis=0).sort_values(
            ascending=False)
        # Select hit strains: strains with one or more significant features
        hit_strains_nsig = ranked_nsig[ranked_nsig > 0].index.to_list()
        #hit_nuo = ranked_nsig[[i for i in ranked_nsig[ranked_nsig > 0].index if 'nuo' in i]]
        # if no sigfeats, subset for top strains ranked by lowest p-value by t-test for any feature
        print("%d significant strains (with 1 or more significant features)" %
              len(hit_strains_nsig))
        if len(hit_strains_nsig) > 0:
            write_list_to_file(hit_strains_nsig, stats_dir / 'hit_strains.txt')

        # Rank strains by lowest p-value for any feature
        ranked_pval = pvals_t.min(axis=0).sort_values(ascending=True)
        # Select the N_LOWEST_PVAL strains with the lowest p-value for any feature
        hit_strains_pval = ranked_pval.index[:N_LOWEST_PVAL].to_list()
        write_list_to_file(
            hit_strains_pval,
            stats_dir / 'lowest{}_pval.txt'.format(N_LOWEST_PVAL))

        print("\nPlotting ranked strains by number of significant features")
        ranked_nsig_path = plot_dir / (
            'ranked_number_sigfeats' + '_' +
            ('uncorrected' if args.fdr_method is None else args.fdr_method) +
            '.png')
        plt.ioff()
        plt.close('all')
        fig, ax = plt.subplots(figsize=(20, 6))
        ax.plot(ranked_nsig)
        if len(ranked_nsig.index) > 250:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(ranked_nsig.index.to_list(),
                               rotation=90,
                               fontsize=5)
        plt.xlabel("Strains (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Number of significant features", fontsize=12, labelpad=10)
        plt.subplots_adjust(left=0.08, right=0.98, bottom=0.15)
        plt.savefig(ranked_nsig_path, dpi=600)

        print("Plotting ranked strains by lowest p-value of any feature")
        lowest_pval_path = plot_dir / (
            'ranked_lowest_pval' + '_' +
            ('uncorrected' if args.fdr_method is None else args.fdr_method) +
            '.png')
        plt.close('all')
        fig, ax = plt.subplots(figsize=(20, 6))
        ax.plot(ranked_pval)
        plt.axhline(y=args.pval_threshold, c='dimgray', ls='--')
        if len(ranked_pval.index) > 250:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(ranked_pval.index.to_list(),
                               rotation=90,
                               fontsize=5)
        plt.xlabel("Strains (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Lowest p-value by t-test", fontsize=12, labelpad=10)
        plt.subplots_adjust(left=0.08, right=0.98, bottom=0.15)
        plt.savefig(lowest_pval_path, dpi=600)
        plt.close()

        print("\nMaking errorbar plots")
        errorbar_sigfeats(features,
                          metadata,
                          group_by=grouping_var,
                          fset=fset,
                          control=control,
                          rank_by='mean',
                          max_feats2plt=args.n_sig_features,
                          figsize=[20, 10],
                          fontsize=5,
                          ms=8,
                          elinewidth=1.5,
                          fmt='.',
                          tight_layout=[0.01, 0.01, 0.99, 0.99],
                          saveDir=plot_dir / 'errorbar')

        # =============================================================================
        #         print("Making boxplots")
        #         boxplots_grouped(feat_meta_df=metadata.join(features),
        #                           group_by=grouping_var,
        #                           control_group=control,
        #                           test_pvalues_df=(pvals_t.T if len(fset) > 0 else None),
        #                           feature_set=fset,
        #                           max_feats2plt=args.n_sig_features,
        #                           max_groups_plot_cap=None,
        #                           p_value_threshold=args.pval_threshold,
        #                           drop_insignificant=False,
        #                           sns_colour_palette="tab10",
        #                           figsize=[6,130],
        #                           saveDir=plot_dir / ('boxplots' + '_' + (
        #                                   'uncorrected' if args.fdr_method is None else args.fdr_method) +
        #                                   '.png'))
        # =============================================================================

        # If no sigfeats, subset for top strains ranked by lowest p-value by t-test for any feature
        if len(hit_strains_nsig) == 0:
            print(
                "\nSaving lowest %d strains ranked by p-value for any feature" %
                N_LOWEST_PVAL)
            write_list_to_file(hit_strains_pval,
                               stats_dir / 'Top{}_lowest_pval.txt'.format(N_LOWEST_PVAL))
            hit_strains = hit_strains_pval
        else:
            hit_strains = hit_strains_nsig

        # Individual boxplots of significant features by pairwise t-test (each group vs control)
        boxplots_sigfeats(
            features,
            y_class=metadata[grouping_var],
            control=control,
            pvals=pvals_t,
            z_class=metadata['date_yyyymmdd'],
            feature_set=None,
            saveDir=plot_dir / 'paired_boxplots',
            p_value_threshold=args.pval_threshold,
            drop_insignificant=len(hit_strains) > 0,
            max_sig_feats=args.n_sig_features,
            max_strains=N_LOWEST_PVAL if len(hit_strains_nsig) == 0 else None,
            sns_colour_palette="tab10",
            verbose=False)

        if SUBSET_HIT_STRAINS:
            strain_list = [control] + hit_strains[:TOP_N_HITS]
            print("Subsetting for Top%d hit strains" % (len(strain_list) - 1))
            features, metadata = subset_results(features,
                                                metadata,
                                                column=grouping_var,
                                                groups=strain_list,
                                                verbose=False)
        else:
            strain_list = list(metadata[grouping_var].unique())

# =============================================================================
#         # NOT NECESSARY FOR ALL STRAINS - LOOK AT CONTROL ONLY FOR THIS
#         # superplots of variation with respect to 'date_yyyymmdd'
#         print("\nPlotting superplots of date variation for significant features")
#         for feat in tqdm(fset[:args.n_sig_features]):
#             # plot day variation
#             superplot(features, metadata, feat,
#                       x1='date_yyyymmdd',
#                       x2=None,
#                       saveDir=plot_dir / 'superplots',
#                       figsize=[24,6],
#                       show_points=False,
#                       plot_means=True,
#                       dodge=False)
#             # plot run number vs day variation
#             superplot(features, metadata, feat,
#                       x1='date_yyyymmdd',
#                       x2='imaging_run_number',
#                       saveDir=plot_dir / 'superplots',
#                       figsize=[24,6],
#                       show_points=False,
#                       plot_means=True,
#                       dodge=True)
#             # plot plate number variation
#             superplot(features, metadata, feat,
#                       x1='date_yyyymmdd',
#                       x2='source_plate_id',
#                       saveDir=plot_dir / 'superplots',
#                       figsize=[24,6],
#                       show_points=False,
#                       plot_means=True,
#                       dodge=True)
#             # plot instrument name variation
#             superplot(features, metadata, feat,
#                       x1='date_yyyymmdd',
#                       x2='instrument_name',
#                       saveDir=plot_dir / 'superplots',
#                       figsize=[24,6],
#                       show_points=False,
#                       plot_means=True,
#                       dodge=True)
# =============================================================================

# from tierpsytools.analysis.significant_features import plot_feature_boxplots
# plot_feature_boxplots(feat_to_plot=features,
#                       y_class=metadata[grouping_var],
#                       scores=pvals_t.rank(axis=1),
#                       pvalues=np.asarray(pvals_t).flatten(),
#                       saveto=None,
#                       close_after_plotting=True)

    ##### Hierarchical Clustering Analysis #####

    # Z-normalise control data
    control_featZ = control_features.apply(zscore, axis=0)
    #featZ = (features-features.mean())/features.std() # minus mean, divide by std

    #from tierpsytools.preprocessing.scaling_class import scalingClass
    #scaler = scalingClass(scaling='standardize')
    #featZ = scaler.fit_transform(features)

    ### Control clustermap

    # control data is clustered and feature order is stored and applied to full data
    print("\nPlotting control clustermap")

    control_clustermap_path = plot_dir / 'heatmaps' / 'date_clustermap.pdf'
    cg = plot_clustermap(
        control_featZ,
        control_metadata,
        group_by=([grouping_var] if grouping_var == 'date_yyyymmdd' else
                  [grouping_var, 'date_yyyymmdd']),
        method=METHOD,
        metric=METRIC,
        figsize=[20, 6],
        sub_adj={
            'bottom': 0.05,
            'left': 0,
            'top': 1,
            'right': 0.85
        },
        saveto=control_clustermap_path,
        label_size=15,
        show_xlabels=False)
    # control clustermap with labels
    if args.n_top_feats <= 256:
        control_clustermap_path = plot_dir / 'heatmaps' / 'date_clustermap_label.pdf'
        cg = plot_clustermap(
            control_featZ,
            control_metadata,
            group_by=([grouping_var] if grouping_var == 'date_yyyymmdd' else
                      [grouping_var, 'date_yyyymmdd']),
            method=METHOD,
            metric=METRIC,
            figsize=[20, 10],
            sub_adj={
                'bottom': 0.5,
                'left': 0,
                'top': 1,
                'right': 0.85
            },
            saveto=control_clustermap_path,
            label_size=(15, 15),
            show_xlabels=True)

    #col_linkage = cg.dendrogram_col.calculated_linkage
    control_clustered_features = np.array(
        control_featZ.columns)[cg.dendrogram_col.reordered_ind]

    ### Full clustermap

    # Z-normalise data for all strains
    featZ = features.apply(zscore, axis=0)

    ## Save z-normalised values
    # z_stats = featZ.join(hit_metadata[grouping_var]).groupby(by=grouping_var).mean().T
    # z_stats.columns = ['z-mean_' + v for v in z_stats.columns.to_list()]
    # z_stats.to_csv(z_stats_path, header=True, index=None)

    # Clustermap of full data
    print("Plotting all strains clustermap")
    full_clustermap_path = plot_dir / 'heatmaps' / (grouping_var +
                                                    '_clustermap.pdf')
    fg = plot_clustermap(featZ,
                         metadata,
                         group_by=grouping_var,
                         row_colours=None,
                         method=METHOD,
                         metric=METRIC,
                         figsize=[20, 30],
                         sub_adj={
                             'bottom': 0.01,
                             'left': 0,
                             'top': 1,
                             'right': 0.95
                         },
                         saveto=full_clustermap_path,
                         label_size=8,
                         show_xlabels=False)

    if args.n_top_feats <= 256:
        full_clustermap_path = plot_dir / 'heatmaps' / (
            grouping_var + '_clustermap_label.pdf')
        fg = plot_clustermap(featZ,
                             metadata,
                             group_by=grouping_var,
                             row_colours=None,
                             method=METHOD,
                             metric=METRIC,
                             figsize=[20, 40],
                             sub_adj={
                                 'bottom': 0.18,
                                 'left': 0,
                                 'top': 1,
                                 'right': 0.95
                             },
                             saveto=full_clustermap_path,
                             label_size=(15, 10),
                             show_xlabels=True)

    # clustered feature order for all strains
    _ = np.array(featZ.columns)[fg.dendrogram_col.reordered_ind]

    pvals_heatmap = anova_table.loc[control_clustered_features, 'pvals']
    pvals_heatmap.name = 'P < {}'.format(args.pval_threshold)

    assert all(f in featZ.columns for f in pvals_heatmap.index)

    # Plot heatmap (averaged for each sample)
    if len(metadata[grouping_var].unique()) < 250:
        print("\nPlotting barcode heatmap")
        heatmap_path = plot_dir / 'heatmaps' / (grouping_var + '_heatmap.pdf')
        plot_barcode_heatmap(
            featZ=featZ[control_clustered_features],
            meta=metadata,
            group_by=[grouping_var],
            pvalues_series=pvals_heatmap,
            p_value_threshold=args.pval_threshold,
            selected_feats=None,  # fset if len(fset) > 0 else None
            saveto=heatmap_path,
            figsize=[20, 30],
            sns_colour_palette="Pastel1",
            label_size=10)

    ##### Principal Components Analysis #####

    pca_dir = plot_dir / 'PCA'

    # remove outlier samples from PCA
    if args.remove_outliers:
        outlier_path = pca_dir / 'mahalanobis_outliers.pdf'
        features, inds = remove_outliers_pca(df=features, saveto=outlier_path)
        metadata = metadata.reindex(features.index)  # reindex metadata
        featZ = features.apply(zscore, axis=0)  # re-normalise data

        # Drop features with NaN values after normalising
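        # (zero-variance features divide by zero during z-scoring, yielding all-NaN columns)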
        n_cols = len(featZ.columns)
        featZ.dropna(axis=1, inplace=True)
        n_dropped = n_cols - len(featZ.columns)
        if n_dropped > 0:
            print("Dropped %d features after normalisation (NaN)" % n_dropped)

    coloured_strains_pca = [control] + hit_strains[:15]
    coloured_strains_pca = [
        s for s in coloured_strains_pca
        if s in metadata[grouping_var].unique()
    ]

    #from tierpsytools.analysis.decomposition import plot_pca
    _ = plot_pca(featZ,
                 metadata,
                 group_by=grouping_var,
                 control=control,
                 var_subset=coloured_strains_pca,
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 sns_colour_palette="plasma",
                 n_dims=2,
                 label_size=8,
                 sub_adj={
                     'bottom': 0.13,
                     'left': 0.13,
                     'top': 0.95,
                     'right': 0.88
                 },
                 legend_loc=[1.02, 0.6],
                 hypercolor=False)

    # add details of COG category information to metadata
    # (using hard-coded dict of info from Baba et al. 2006 paper)
    metadata['COG_category'] = metadata['COG_category'].map(COG_category_dict)
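    # (COG_category_dict is assumed to map COG identifiers to their descriptions,
    #  e.g. 'C' -> 'Energy production and conversion')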

    # plot pca coloured by Keio COG category
    _ = plot_pca(featZ,
                 metadata,
                 group_by='COG_category',
                 control=None,
                 var_subset=list(metadata['COG_category'].dropna().unique()),
                 saveDir=pca_dir / 'COG',
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 n_dims=2,
                 hypercolor=False,
                 label_size=8,
                 figsize=[12, 8],
                 sub_adj={
                     'bottom': 0.1,
                     'left': 0.1,
                     'top': 0.95,
                     'right': 0.7
                 },
                 legend_loc=[1.02, 0.6],
                 sns_colour_palette="plasma")

    ##### t-distributed Stochastic Neighbour Embedding #####

    print("\nPerforming tSNE")
    tsne_dir = plot_dir / 'tSNE'
    perplexities = [mean_sample_size]  # NB: should be roughly equal to group size
    _ = plot_tSNE(featZ,
                  metadata,
                  group_by=grouping_var,
                  var_subset=coloured_strains_pca,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")

    print("\nPerforming tSNE")
    tsne_dir = plot_dir / 'tSNE'
    perplexities = [mean_sample_size
                    ]  # NB: should be roughly equal to group size
    _ = plot_tSNE(featZ,
                  metadata,
                  group_by='COG_category',
                  var_subset=list(metadata['COG_category'].dropna().unique()),
                  saveDir=tsne_dir / 'COG_category',
                  perplexities=perplexities,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")

    ##### Uniform Manifold Projection #####

    print("\nPerforming UMAP")
    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [mean_sample_size]  # NB: should be roughly equal to group size
    min_dist = 0.1  # Minimum distance parameter
    _ = plot_umap(featZ,
                  metadata,
                  group_by=grouping_var,
                  var_subset=coloured_strains_pca,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")
Example #6
    metadata = pd.read_csv(METADATA_PATH, dtype={'comments':str, 'source_plate_id':str})
    
    # Subset for control data only
    control_strain = args.control_dict[args.grouping_variable] # control strain to use
    control_features, control_metadata = subset_results(features, metadata, 
                                                        column=args.grouping_variable,
                                                        groups=[control_strain])
    # Subset for imaging dates of interest    
    if args.dates is not None:
        dates = [int(d) for d in args.dates]
        control_features, control_metadata = subset_results(control_features, control_metadata, 
                                                            column='date_yyyymmdd', groups=dates)

    # Clean data after subset - to remove features with zero std
    control_features, control_metadata = clean_summary_results(control_features, 
                                                               control_metadata, 
                                                               max_value_cap=False,
                                                               imputeNaN=False)
    
    # Load Tierpsy Top feature set + subset (columns) for top feats only
    if args.n_top_feats is not None:
        top_feats_path = Path(args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(str(args.n_top_feats))
        topfeats = load_topfeats(top_feats_path, add_bluelight=True, 
                                 remove_path_curvature=True, header=None)
        
        # Drop features that are not in results
        top_feats_list = [feat for feat in list(topfeats) if feat in control_features.columns]
        control_features = control_features[top_feats_list]

    print("Investigating variation in '%s' (control %s)" % (control_strain, args.grouping_variable))
    control_variation(control_features, 
                      control_metadata,