Example #1
def perform_fast_effect_stats(features, metadata, window_list, args):
    """ Pairwise t-tests for each window comparing worm 'motion mode paused fraction' on 
        Keio mutants vs BW control 
    """

    # categorical variables to investigate: 'gene_name' and 'window'
    print(
        "\nInvestigating variation in fraction of worms paused between hit strains and control "
        + "(for each window)")

    # assert there will be no errors due to case-sensitivity
    assert len(metadata['gene_name'].unique()) == len(
        metadata['gene_name'].str.upper().unique())

    # subset for windows in window_frame_dict
    assert all(w in metadata['window'].unique() for w in window_list)
    metadata = metadata[metadata['window'].isin(window_list)]
    features = features.reindex(metadata.index)

    control_strain = args.control_dict['gene_name']
    strain_list = [s for s in metadata['gene_name'].unique() if s != control_strain]

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=['gene_name', 'window'])
    print("Mean sample size of strain/window: %d" %
          (int(sample_size['n_samples'].mean())))

    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / "Stats" / args.fdr_method

    control_meta = metadata[metadata['gene_name'] == control_strain]
    control_feat = features.reindex(control_meta.index)
    control_df = control_meta.join(control_feat[[FEATURE]])

    for strain in strain_list:
        print(
            "\nPairwise t-tests for each window comparing fraction of worms paused "
            + "on %s vs control" % strain)
        strain_meta = metadata[metadata['gene_name'] == strain]
        strain_feat = features.reindex(strain_meta.index)
        strain_df = strain_meta.join(strain_feat[[FEATURE]])

        stats, pvals, reject = pairwise_ttest(control_df,
                                              strain_df,
                                              feature_list=[FEATURE],
                                              group_by='window',
                                              fdr_method=args.fdr_method,
                                              fdr=0.05)

        # compile table of results
        stats.columns = ['stats_' + str(c) for c in stats.columns]
        pvals.columns = ['pvals_' + str(c) for c in pvals.columns]
        reject.columns = ['reject_' + str(c) for c in reject.columns]
        test_results = pd.concat([stats, pvals, reject], axis=1)

        # save results
        ttest_strain_path = stats_dir / 'pairwise_ttests' / '{}_window_results.csv'.format(
            strain)
        ttest_strain_path.parent.mkdir(parents=True, exist_ok=True)
        test_results.to_csv(ttest_strain_path, header=True, index=True)

        for window in window_list:
            print("%s difference in '%s' between %s vs %s in window %s (paired t-test, P=%.3f, %s)" %\
                  (("SIGNIFICANT" if reject.loc[FEATURE, 'reject_{}'.format(window)] else "No"),
                  FEATURE, strain, control_strain, window, pvals.loc[FEATURE, 'pvals_{}'.format(window)],
                  args.fdr_method))

    return
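# --- Hedged usage sketch (not part of the original script) -------------------
# A minimal, self-contained illustration of the per-window pattern above:
# t-test one strain vs control within each window, then FDR-correct across
# windows. scipy/statsmodels stand in for the project's pairwise_ttest helper,
# whose internals are not shown here; data and column names are made up.
import numpy as np
import pandas as pd
from scipy import stats as ss
from statsmodels.stats.multitest import multipletests

rng = np.random.default_rng(0)
demo = pd.DataFrame({'gene_name': ['BW'] * 30 + ['mutant'] * 30,
                     'window': list(range(3)) * 20,
                     'paused_fraction': rng.uniform(0, 1, 60)})

pvals_demo = {}
for window, df in demo.groupby('window'):
    control = df.loc[df['gene_name'] == 'BW', 'paused_fraction']
    mutant = df.loc[df['gene_name'] == 'mutant', 'paused_fraction']
    pvals_demo[window] = ss.ttest_ind(control, mutant).pvalue

# correct p-values across windows (method matches args.fdr_method above)
reject_demo, pvals_corrected, _, _ = multipletests(list(pvals_demo.values()),
                                                   alpha=0.05, method='fdr_by')
for w, p, r in zip(pvals_demo, pvals_corrected, reject_demo):
    print("window %s: P=%.3f (%s)" % (w, p, "SIGNIFICANT" if r else "ns"))
# ------------------------------------------------------------------------------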
Example #2
def dead_keio_stats(features, metadata, args):
    """ Perform statistical analyses on dead Keio experiment results:
        - t-tests for each feature comparing each strain vs control for paired antioxidant treatment conditions
        - t-tests for each feature comparing each strain antioxidant treatment to negative control (no antioxidant)
        
        Inputs
        ------
        features, metadata : pd.DataFrame
            Clean feature summaries and accompanying metadata
        
        args : Object
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - control_dict : dict
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int           
    """

    print("\nInvestigating variation in worm behaviour on dead vs alive hit Keio strains")  

    # assert there will be no errors due to case-sensitivity
    assert len(metadata[STRAIN_COLNAME].unique()) == len(metadata[STRAIN_COLNAME].str.upper().unique())
    assert all(type(b) == np.bool_ for b in metadata[TREATMENT_COLNAME].unique())
    
    # Load Tierpsy feature set + subset (columns) for selected features only
    features = select_feat_set(features, 'tierpsy_{}'.format(args.n_top_feats), append_bluelight=True)
    features = features[[f for f in features.columns if 'path_curvature' not in f]]
    
    assert not features.isna().any().any()
    #n_feats = features.shape[1]
    
    strain_list = list(metadata[STRAIN_COLNAME].unique())
    assert CONTROL_STRAIN in strain_list

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=[STRAIN_COLNAME, TREATMENT_COLNAME])
    print("Mean sample size of %s: %d" % (STRAIN_COLNAME, int(sample_size['n_samples'].mean())))
    
    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / "Stats" / args.fdr_method
    
    ##### ANOVA #####

    # make path to save ANOVA results
    test_path = stats_dir / 'ANOVA_results.csv'
    test_path.parent.mkdir(exist_ok=True, parents=True)

    # ANOVA across strains for significant feature differences
    if len(metadata[STRAIN_COLNAME].unique()) > 2:   
        stats, pvals, reject = univariate_tests(X=features, 
                                                y=metadata[STRAIN_COLNAME], 
                                                test='ANOVA',
                                                control=CONTROL_STRAIN,
                                                comparison_type='multiclass',
                                                multitest_correction=None, # uncorrected
                                                alpha=args.pval_threshold,
                                                n_permutation_test=None) # 'all'
    
        # get effect sizes
        effect_sizes = get_effect_sizes(X=features, 
                                        y=metadata[STRAIN_COLNAME], 
                                        control=CONTROL_STRAIN,
                                        effect_type=None,
                                        linked_test='ANOVA')
    
        # correct for multiple comparisons
        reject_corrected, pvals_corrected = _multitest_correct(pvals, 
                                                               multitest_method=args.fdr_method,
                                                               fdr=args.pval_threshold)
                                    
        # compile + save results (corrected)
        test_results = pd.concat([stats, effect_sizes, pvals_corrected, reject_corrected], axis=1)
        test_results.columns = ['stats','effect_size','pvals','reject']     
        test_results['significance'] = sig_asterix(test_results['pvals'])
        test_results = test_results.sort_values(by=['pvals'], ascending=True) # rank pvals
        test_results.to_csv(test_path, header=True, index=True)
        
        nsig = test_results['reject'].sum()
        print("%d features (%.f%%) signficantly different among '%s'" % (nsig, 
              len(test_results.index)/nsig, STRAIN_COLNAME))

    
    ##### t-tests #####

    t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum

    for strain in strain_list:
        strain_meta = metadata[metadata[STRAIN_COLNAME]==strain]
        strain_feat = features.reindex(strain_meta.index)
                     
        ### t-tests for each feature comparing live vs dead behaviour
    
        ttest_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format((t_test + '_' + strain))
        ttest_path = stats_dir / '{}_results.csv'.format((t_test + '_' + strain))  
        ttest_path.parent.mkdir(exist_ok=True, parents=True)

        # perform t-tests (without correction for multiple testing)
        stats_t, pvals_t, reject_t = univariate_tests(X=strain_feat, 
                                                      y=strain_meta[TREATMENT_COLNAME], 
                                                      control=CONTROL_TREATMENT, 
                                                      test=t_test,
                                                      comparison_type='binary_each_group',
                                                      multitest_correction=None, 
                                                      alpha=0.05)
        # get effect sizes for comparisons
        effect_sizes_t =  get_effect_sizes(X=strain_feat, 
                                           y=strain_meta[TREATMENT_COLNAME], 
                                           control=CONTROL_TREATMENT,
                                           effect_type=None,
                                           linked_test=t_test)
        
        # compile + save t-test results (uncorrected)
        stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
        pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
        reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
        effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
        ttest_uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
        ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)
        
        # correct for multiple comparisons
        pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
        reject_t, pvals_t = _multitest_correct(pvals_t, 
                                               multitest_method=args.fdr_method,
                                               fdr=args.pval_threshold)

        # compile + save t-test results (corrected)
        pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
        reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
        ttest_corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
        ttest_corrected.to_csv(ttest_path, header=True, index=True)

        # record t-test significant features (not ordered)
        fset_ttest = pvals_t.index[reject_t.any(axis=1)].unique().to_list()
        #assert set(fset_ttest) == set(pvals_t.index[(pvals_t < args.pval_threshold).sum(axis=1) > 0])
        print("%d significant features for %s on any %s vs %s (%s, %s, P<%.2f)" % (len(fset_ttest),
              strain, TREATMENT_COLNAME, CONTROL_TREATMENT, t_test, args.fdr_method, args.pval_threshold))

        if len(fset_ttest) > 0:
            ttest_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format((t_test + '_' + strain))
            write_list_to_file(fset_ttest, ttest_sigfeats_path)

    ##### for LIVE bacteria: compare each strain with control #####
    
    live_metadata = metadata[metadata['dead']==False]
    live_features = features.reindex(live_metadata.index)    

    ttest_path_uncorrected = stats_dir / '{}_live_uncorrected.csv'.format(t_test)
    ttest_path = stats_dir / '{}_live_results.csv'.format(t_test)
    ttest_path.parent.mkdir(exist_ok=True, parents=True)
    
    # perform t-tests (without correction for multiple testing)   
    stats_t, pvals_t, reject_t = univariate_tests(X=live_features, 
                                                  y=live_metadata[STRAIN_COLNAME], 
                                                  control=CONTROL_STRAIN, 
                                                  test=t_test,
                                                  comparison_type='binary_each_group',
                                                  multitest_correction=None, 
                                                  alpha=0.05)
    
    # get effect sizes for comparisons
    effect_sizes_t =  get_effect_sizes(X=live_features, 
                                       y=live_metadata[STRAIN_COLNAME], 
                                       control=CONTROL_STRAIN,
                                       effect_type=None,
                                       linked_test=t_test)
    
    # compile + save t-test results (uncorrected)
    stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
    pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
    reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
    effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
    ttest_uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
    ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)
    
    # correct for multiple comparisons
    pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
    reject_t, pvals_t = _multitest_correct(pvals_t, 
                                           multitest_method=args.fdr_method,
                                           fdr=args.pval_threshold)

    # compile + save t-test results (corrected)
    pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
    reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
    ttest_corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
    ttest_corrected.to_csv(ttest_path, header=True, index=True)

    # record t-test significant features (not ordered)
    fset_ttest = pvals_t.index[reject_t.any(axis=1)].unique().to_list()
    #assert set(fset_ttest) == set(pvals_t.index[(pvals_t < args.pval_threshold).sum(axis=1) > 0])
    print("LIVE BACTERIA: %d significant features for any %s vs %s (%s, %s, P<%.2f)" %\
          (len(fset_ttest), STRAIN_COLNAME, CONTROL_STRAIN, t_test, args.fdr_method, 
           args.pval_threshold))

    if len(fset_ttest) > 0:
        ttest_sigfeats_path = stats_dir / '{}_live_sigfeats.txt'.format(t_test)
        write_list_to_file(fset_ttest, ttest_sigfeats_path)

    ##### for DEAD bacteria: compare each strain with control #####
    
    dead_metadata = metadata[metadata['dead']==True]
    dead_features = features.reindex(dead_metadata.index)    

    ttest_path_uncorrected = stats_dir / '{}_dead_uncorrected.csv'.format(t_test)
    ttest_path = stats_dir / '{}_dead_results.csv'.format(t_test)
    ttest_path.parent.mkdir(exist_ok=True, parents=True)
    
    # perform t-tests (without correction for multiple testing)   
    stats_t, pvals_t, reject_t = univariate_tests(X=dead_features, 
                                                  y=dead_metadata[STRAIN_COLNAME], 
                                                  control=CONTROL_STRAIN, 
                                                  test=t_test,
                                                  comparison_type='binary_each_group',
                                                  multitest_correction=None, 
                                                  alpha=0.05)
    
    # get effect sizes for comparisons
    effect_sizes_t =  get_effect_sizes(X=dead_features, 
                                       y=dead_metadata[STRAIN_COLNAME], 
                                       control=CONTROL_STRAIN,
                                       effect_type=None,
                                       linked_test=t_test)
    
    # compile + save t-test results (uncorrected)
    stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
    pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
    reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
    effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
    ttest_uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
    ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)
    
    # correct for multiple comparisons
    pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
    reject_t, pvals_t = _multitest_correct(pvals_t, 
                                           multitest_method=args.fdr_method,
                                           fdr=args.pval_threshold)

    # compile + save t-test results (corrected)
    pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
    reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
    ttest_corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
    ttest_corrected.to_csv(ttest_path, header=True, index=True)

    # record t-test significant features (not ordered)
    fset_ttest = pvals_t.index[reject_t.any(axis=1)].unique().to_list()
    #assert set(fset_ttest) == set(pvals_t.index[(pvals_t < args.pval_threshold).sum(axis=1) > 0])
    print("DEAD BACTERIA: %d significant features for any %s vs %s (%s, %s, P<%.2f)" %\
          (len(fset_ttest), STRAIN_COLNAME, CONTROL_STRAIN, t_test, args.fdr_method, 
           args.pval_threshold))

    if len(fset_ttest) > 0:
        ttest_sigfeats_path = stats_dir / '{}_dead_sigfeats.txt'.format(t_test)
        write_list_to_file(fset_ttest, ttest_sigfeats_path)  
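# --- Hedged refactor sketch (not part of the original script) ----------------
# The "run t-tests -> prefix columns -> save uncorrected -> correct -> save
# corrected" block above is repeated three times (per strain, live-only and
# dead-only). A helper along these lines could factor it out; univariate_tests,
# get_effect_sizes and _multitest_correct are the project's own functions, with
# signatures taken from the calls above.
def run_ttests_and_save(X, y, control, t_test, fdr_method, fdr,
                        path_uncorrected, path_corrected):
    """ Run binary t-tests of each group vs control, then save uncorrected and
        FDR-corrected results tables
    """
    stats_t, pvals_t, reject_t = univariate_tests(X=X, y=y, control=control, test=t_test,
                                                  comparison_type='binary_each_group',
                                                  multitest_correction=None, alpha=0.05)
    effect_sizes_t = get_effect_sizes(X=X, y=y, control=control,
                                      effect_type=None, linked_test=t_test)
    stats_t = stats_t.add_prefix('stats_')
    effect_sizes_t = effect_sizes_t.add_prefix('effect_size_')

    # save uncorrected results
    uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t.add_prefix('pvals_'),
                             reject_t.add_prefix('reject_')], axis=1)
    uncorrected.to_csv(path_uncorrected, header=True, index=True)

    # correct for multiple comparisons + save corrected results
    reject_t, pvals_t = _multitest_correct(pvals_t, multitest_method=fdr_method, fdr=fdr)
    corrected = pd.concat([stats_t, effect_sizes_t, pvals_t.add_prefix('pvals_'),
                           reject_t.add_prefix('reject_')], axis=1)
    corrected.to_csv(path_corrected, header=True, index=True)

    return corrected, pvals_t, reject_t
# ------------------------------------------------------------------------------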
Example #3
    
    # assert there will be no errors due to case-sensitivity
    assert len(metadata['gene_name'].unique()) == len(metadata['gene_name'].str.upper().unique())
    assert len(metadata['antioxidant'].unique()) == len(metadata['antioxidant'].str.upper().unique())

    if ALL_WINDOWS:
        WINDOW_LIST = list(WINDOW_FRAME_DICT.keys())
        args.save_dir = Path(args.save_dir) / 'all_windows'
     
    # subset for windows in window_frame_dict
    assert all(w in metadata['window'].unique() for w in WINDOW_LIST)
    metadata = metadata[metadata['window'].isin(WINDOW_LIST)]
    features = features.reindex(metadata.index)

    # statistics save path
    save_dir = get_save_dir(args)
    
    acute_rescue_stats(features, 
                       metadata, 
                       save_dir=save_dir, 
                       control_strain=args.control_dict['gene_name'],
                       control_antioxidant=args.control_dict['antioxidant'],
                       control_window=args.control_dict['window'],
                       fdr_method='fdr_by',
                       pval_threshold=args.pval_threshold)
    
    analyse_acute_rescue(features, 
                         metadata, 
                         save_dir=save_dir,
                         control_strain=args.control_dict['gene_name'],
                         control_antioxidant=args.control_dict['antioxidant'],
                         control_window=args.control_dict['window'],
                         fdr_method='fdr_by',
                         pval_threshold=args.pval_threshold)
Example #4
def analyse_fast_effect(features, metadata, window_list, args):

    # categorical variables to investigate: 'gene_name' and 'window'
    print(
        "\nInvestigating variation in fraction of worms paused between hit strains and control "
        + "(for each window)")

    # assert there will be no errors due to case-sensitivity
    assert len(metadata['gene_name'].unique()) == len(
        metadata['gene_name'].str.upper().unique())

    # subset for windows in window_frame_dict
    assert all(w in metadata['window'].unique() for w in window_list)
    metadata = metadata[metadata['window'].isin(window_list)]
    features = features.reindex(metadata.index)

    control_strain = args.control_dict['gene_name']
    strain_list = [s for s in metadata['gene_name'].unique() if s != control_strain]

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=['gene_name', 'window'])
    print("Mean sample size of strain/window: %d" %
          (int(sample_size['n_samples'].mean())))

    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / "Stats" / args.fdr_method
    plot_dir = save_dir / "Plots" / args.fdr_method

    # plot dates as different colours (in loop)
    date_lut = dict(
        zip(
            list(metadata['date_yyyymmdd'].unique()),
            sns.color_palette('Set1',
                              n_colors=len(
                                  metadata['date_yyyymmdd'].unique()))))

    for strain in strain_list:
        print("Plotting windows for %s vs control" % strain)

        plot_meta = metadata[np.logical_or(
            metadata['gene_name'] == strain,
            metadata['gene_name'] == control_strain)]
        plot_feat = features.reindex(plot_meta.index)
        plot_df = plot_meta.join(plot_feat[[FEATURE]])

        # plot control/strain for all windows
        plt.close('all')
        fig, ax = plt.subplots(
            figsize=((len(window_list) if len(window_list) >= 20 else 12), 8))
        ax = sns.boxplot(x='window',
                         y=FEATURE,
                         hue='gene_name',
                         hue_order=[control_strain, strain],
                         data=plot_df,
                         palette='Set3',
                         dodge=True,
                         ax=ax)
        for date in date_lut.keys():
            date_df = plot_df[plot_df['date_yyyymmdd'] == date]
            ax = sns.stripplot(x='window',
                               y=FEATURE,
                               hue='gene_name',
                               hue_order=[control_strain, strain],
                               data=date_df,
                               palette={
                                   control_strain: date_lut[date],
                                   strain: date_lut[date]
                               },
                               alpha=0.7,
                               size=4,
                               dodge=True,
                               ax=ax)
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs],
                  labels[:n_labs],
                  fontsize=15,
                  frameon=False,
                  loc='upper right')

        # scale y-axis to omit extreme outliers (capped at median + 3*IQR)
        if scale_outliers_box:
            grouped_strain = plot_df.groupby('window')
            y_bar = grouped_strain[FEATURE].median()  # median is less skewed by outliers
            # compute the interquartile range (IQR)
            Q1 = grouped_strain[FEATURE].quantile(0.25)
            Q3 = grouped_strain[FEATURE].quantile(0.75)
            IQR = Q3 - Q1
            plt.ylim(-0.02, max(y_bar) + 3 * max(IQR))

        # # add bluelight windows to plot
        # if ALL_WINDOWS:
        #     bluelight_times = [WINDOW_FRAME_DICT[w] for w in WINDOW_LIST]
        #     # rescale window times to box plot positions: (xi – min(x)) / (max(x) – min(x)) * n_boxes
        #     n_boxes = len(WINDOW_FRAME_DICT.keys())
        #     ax = add_bluelight_to_plot(ax, bluelight_times, alpha=0.5)

        # load t-test results for strain (once, outside the window loop) + annotate p-values
        ttest_strain_path = stats_dir / 'pairwise_ttests' / '{}_window_results.csv'.format(strain)
        ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0, header=0)
        strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]]
        strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns]
        for ii, window in enumerate(window_list):
            p = strain_pvals_t.loc[FEATURE, str(window)]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == str(window)
            p_text = 'P<0.001' if p < 0.001 else 'P=%.3f' % p
            #y = (y_bar[antiox] + 2 * IQR[antiox]) if scale_outliers_box else plot_df[feature].max()
            #h = (max(IQR) / 10) if scale_outliers_box else (y - plot_df[feature].min()) / 50
            trans = transforms.blended_transform_factory(
                ax.transData, ax.transAxes)
            plt.plot(
                [ii - .3, ii - .3, ii + .3, ii + .3],
                [0.98, 0.99, 0.99, 0.98],  #[y+h, y+2*h, y+2*h, y+h], 
                lw=1.5,
                c='k',
                transform=trans)
            ax.text(ii,
                    1.01,
                    p_text,
                    fontsize=9,
                    ha='center',
                    va='bottom',
                    transform=trans,
                    rotation=(0 if len(window_list) <= 20 else 90))

        ax.set_xticks(range(len(window_list)))
        xlabels = [str(int(WINDOW_FRAME_DICT[w][0] / 60)) for w in window_list]
        ax.set_xticklabels(xlabels)
        x_text = 'Time (minutes)' if ALL_WINDOWS else 'Time of bluelight 10-second burst (minutes)'
        ax.set_xlabel(x_text, fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_', ' '), fontsize=15, labelpad=10)

        fig_savepath = plot_dir / 'window_boxplots' / strain / (FEATURE +
                                                                '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)

    return
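# --- Hedged sketch (not part of the original script) --------------------------
# The p-value annotation above relies on a blended transform: x is interpreted
# in data coordinates (so brackets track each box position) while y is
# interpreted in axes coordinates (so they sit at a fixed height whatever the
# y-limits). A minimal self-contained demo of the same matplotlib technique:
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms

fig, ax = plt.subplots()
ax.bar([0, 1], [3.0, 4.5])
trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
# bracket spanning x = -0.3 to 0.3 (data units), pinned at 98-99% of axes height
ax.plot([-.3, -.3, .3, .3], [0.98, 0.99, 0.99, 0.98], lw=1.5, c='k', transform=trans)
ax.text(0, 1.01, 'P=0.042', ha='center', va='bottom', transform=trans)  # made-up p-value
plt.close(fig)  # demo only; the real script saves figures to disk instead
# -------------------------------------------------------------------------------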
Example #5
def compare_strains_keio(features, metadata, args):
    """ Compare Keio single-gene deletion mutants with wild-type BW25113 control and look to see if 
        they significantly alter N2 C. elegans behaviour while feeding.
        
        Subset results to omit selected strains (optional) 
        Inputs
        ------
        features, metadata : pd.DataFrame
            Matching features summaries and metadata
        
        args : Object 
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - omit_strains : list
            - grouping_variable : str
            - control_dict : dict
            - collapse_control : bool
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int
    """

    assert set(features.index) == set(metadata.index)

    # categorical variable to investigate, e.g. 'gene_name'
    grouping_var = args.grouping_variable
    n_strains = len(metadata[grouping_var].unique())
    assert n_strains == len(
        metadata[grouping_var].str.upper().unique())  # check case-sensitivity
    print("\nInvestigating '%s' variation (%d groups)" %
          (grouping_var, n_strains))

    # Subset results (rows) to omit selected strains
    if args.omit_strains is not None:
        features, metadata = subset_results(features,
                                            metadata,
                                            column=grouping_var,
                                            groups=args.omit_strains,
                                            omit=True)

    control = args.control_dict[grouping_var]  # control strain to use

    # Load Tierpsy Top feature set + subset (columns) for top feats only
    if args.n_top_feats is not None:
        top_feats_path = Path(
            args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(
                str(args.n_top_feats))
        topfeats = load_topfeats(top_feats_path,
                                 add_bluelight=True,
                                 remove_path_curvature=True,
                                 header=None)

        # Drop features that are not in results
        top_feats_list = [
            feat for feat in list(topfeats) if feat in features.columns
        ]
        features = features[top_feats_list]

    ##### Control variation #####

    control_metadata = metadata[metadata[grouping_var] == control]
    control_features = features.reindex(control_metadata.index)

    # Clean data after subset - to remove features with zero std
    control_feat_clean, control_meta_clean = clean_summary_results(
        control_features,
        control_metadata,
        max_value_cap=False,
        imputeNaN=False)
    if args.analyse_control:
        control_variation(control_feat_clean,
                          control_meta_clean,
                          args,
                          variables=[
                              k for k in args.control_dict.keys()
                              if k != grouping_var
                          ],
                          n_sig_features=10)

    if args.collapse_control:
        print("\nCollapsing control data (mean of each day)")
        features, metadata = average_plate_control_data(features, metadata)

    # Record mean sample size per group
    mean_sample_size = int(
        np.round(
            metadata.join(features).groupby([grouping_var],
                                            as_index=False).size().mean()))
    print("Mean sample size: %d" % mean_sample_size)

    save_dir = get_save_dir(args)
    stats_dir = save_dir / grouping_var / "Stats" / args.fdr_method
    plot_dir = save_dir / grouping_var / "Plots" / args.fdr_method

    ##### STATISTICS #####

    # =============================================================================
    #     ##### Pairplot Tierpsy Features - Pairwise correlation matrix #####
    #     if args.n_top_feats == 16:
    #         g = sns.pairplot(features, height=1.5)
    #         for ax in g.axes.flatten():
    #             # rotate x and y axis labels
    #             ax.set_xlabel(ax.get_xlabel(), rotation = 90)
    #             ax.set_ylabel(ax.get_ylabel(), rotation = 0)
    #         plt.subplots_adjust(left=0.3, bottom=0.3)
    #         plt.show()
    # =============================================================================

    if not args.use_corrected_pvals:
        anova_path = stats_dir / '{}_results_uncorrected.csv'.format(args.test)
    else:
        anova_path = stats_dir / '{}_results.csv'.format(args.test)

    # load results + record significant features
    print("\nLoading statistics results")
    anova_table = pd.read_csv(anova_path, index_col=0)
    pvals = anova_table.sort_values(
        by='pvals', ascending=True)['pvals']  # rank features by p-value
    fset = pvals[pvals < args.pval_threshold].index.to_list()
    print(
        "\n%d significant features found by %s (P<%.2f, %s)" %
        (len(fset), args.test, args.pval_threshold,
         ('uncorrected' if not args.use_corrected_pvals else args.fdr_method)))

    ### k-significant features

    if len(fset) > 0:
        # Compare k sigfeat and ANOVA significant feature set overlap
        if not args.use_corrected_pvals:
            k_sigfeats_path = stats_dir / "k_significant_features_uncorrected.csv"
        else:
            k_sigfeats_path = stats_dir / "k_significant_features.csv"

        ksig_table = pd.read_csv(k_sigfeats_path, index_col=0)
        fset_ksig = ksig_table[
            ksig_table['pvals'] < args.pval_threshold].index.to_list()

        fset_overlap = set(fset).intersection(set(fset_ksig))
        prop_overlap = len(fset_overlap) / len(fset)
        print("%.1f%% overlap with k-significant features" %
              (prop_overlap * 100))

        if prop_overlap < 0.5 and len(fset) > 100:
            print(
                "WARNING: Inconsistency in statistics for feature set agreement between "
                + "%s and k significant features!" % args.test)

        if args.use_k_sig_feats_overlap:
            fset = list(ksig_table.loc[fset_overlap].sort_values(
                by='pvals', ascending=True).index)

        ### t-test

        t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum

        if not args.use_corrected_pvals:
            ttest_path = stats_dir / '{}_results_uncorrected.csv'.format(
                t_test)
        else:
            ttest_path = stats_dir / '{}_results.csv'.format(t_test)

        # read t-test results + record significant features (NOT ORDERED)
        ttest_table = pd.read_csv(ttest_path, index_col=0)
        pvals_t = ttest_table[[c for c in ttest_table if "pvals_" in c]]
        pvals_t.columns = [c.split('pvals_')[-1] for c in pvals_t.columns]
        fset_ttest = pvals_t[(pvals_t < args.pval_threshold).sum(
            axis=1) > 0].index.to_list()
        print("%d significant features found by %s (P<%.2f, %s)" %
              (len(fset_ttest), t_test, args.pval_threshold,
               ('uncorrected'
                if not args.use_corrected_pvals else args.fdr_method)))

    else:
        print("No significant features found for %s by %s" %
              (grouping_var, args.test))

    ##### PLOTTING #####

    if len(fset) > 0:
        # Rank strains by number of sigfeats by t-test
        ranked_nsig = (pvals_t < args.pval_threshold).sum(axis=0).sort_values(
            ascending=False)
        # Select top hit strains by n sigfeats (select strains with > 5 sigfeats as hit strains?)
        hit_strains_nsig = ranked_nsig[ranked_nsig > 0].index.to_list()
        #hit_nuo = ranked_nsig[[i for i in ranked_nsig[ranked_nsig > 0].index if 'nuo' in i]]
        # if no sigfeats, subset for top strains ranked by lowest p-value by t-test for any feature
        print("%d significant strains (with 1 or more significant features)" %
              len(hit_strains_nsig))
        if len(hit_strains_nsig) > 0:
            write_list_to_file(hit_strains_nsig, stats_dir / 'hit_strains.txt')

        # Rank strains by lowest p-value for any feature
        ranked_pval = pvals_t.min(axis=0).sort_values(ascending=True)
        # Select top N_LOWEST_PVAL hit strains by lowest p-value for any feature
        hit_strains_pval = ranked_pval.index[:N_LOWEST_PVAL].to_list()
        write_list_to_file(
            hit_strains_pval,
            stats_dir / 'lowest{}_pval.txt'.format(N_LOWEST_PVAL))

        print("\nPlotting ranked strains by number of significant features")
        ranked_nsig_path = plot_dir / (
            'ranked_number_sigfeats' + '_' +
            ('uncorrected' if args.fdr_method is None else args.fdr_method) +
            '.png')
        plt.ioff()
        plt.close('all')
        fig, ax = plt.subplots(figsize=(20, 6))
        ax.plot(ranked_nsig)
        if len(ranked_nsig.index) > 250:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(ranked_nsig.index.to_list(),
                               rotation=90,
                               fontsize=5)
        plt.xlabel("Strains (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Number of significant features", fontsize=12, labelpad=10)
        plt.subplots_adjust(left=0.08, right=0.98, bottom=0.15)
        plt.savefig(ranked_nsig_path, dpi=600)

        print("Plotting ranked strains by lowest p-value of any feature")
        lowest_pval_path = plot_dir / (
            'ranked_lowest_pval' + '_' +
            ('uncorrected' if args.fdr_method is None else args.fdr_method) +
            '.png')
        plt.close('all')
        fig, ax = plt.subplots(figsize=(20, 6))
        ax.plot(ranked_pval)
        plt.axhline(y=args.pval_threshold, c='dimgray', ls='--')
        if len(ranked_pval.index) > 250:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(ranked_pval.index.to_list(),
                               rotation=90,
                               fontsize=5)
        plt.xlabel("Strains (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Lowest p-value by t-test", fontsize=12, labelpad=10)
        plt.subplots_adjust(left=0.08, right=0.98, bottom=0.15)
        plt.savefig(lowest_pval_path, dpi=600)
        plt.close()

        print("\nMaking errorbar plots")
        errorbar_sigfeats(features,
                          metadata,
                          group_by=grouping_var,
                          fset=fset,
                          control=control,
                          rank_by='mean',
                          max_feats2plt=args.n_sig_features,
                          figsize=[20, 10],
                          fontsize=5,
                          ms=8,
                          elinewidth=1.5,
                          fmt='.',
                          tight_layout=[0.01, 0.01, 0.99, 0.99],
                          saveDir=plot_dir / 'errorbar')

        # =============================================================================
        #         print("Making boxplots")
        #         boxplots_grouped(feat_meta_df=metadata.join(features),
        #                           group_by=grouping_var,
        #                           control_group=control,
        #                           test_pvalues_df=(pvals_t.T if len(fset) > 0 else None),
        #                           feature_set=fset,
        #                           max_feats2plt=args.n_sig_features,
        #                           max_groups_plot_cap=None,
        #                           p_value_threshold=args.pval_threshold,
        #                           drop_insignificant=False,
        #                           sns_colour_palette="tab10",
        #                           figsize=[6,130],
        #                           saveDir=plot_dir / ('boxplots' + '_' + (
        #                                   'uncorrected' if args.fdr_method is None else args.fdr_method) +
        #                                   '.png'))
        # =============================================================================

        # If no sigfeats, subset for top strains ranked by lowest p-value by t-test for any feature
        if len(hit_strains_nsig) == 0:
            print("\nSaving lowest %d strains ranked by p-value for any feature" %
                  N_LOWEST_PVAL)
            write_list_to_file(hit_strains_pval,
                               stats_dir / 'Top{}_lowest_pval.txt'.format(N_LOWEST_PVAL))
            hit_strains = hit_strains_pval
        else:
            hit_strains = hit_strains_nsig

        # Individual boxplots of significant features by pairwise t-test (each group vs control)
        boxplots_sigfeats(
            features,
            y_class=metadata[grouping_var],
            control=control,
            pvals=pvals_t,
            z_class=metadata['date_yyyymmdd'],
            feature_set=None,
            saveDir=plot_dir / 'paired_boxplots',
            p_value_threshold=args.pval_threshold,
            drop_insignificant=True if len(hit_strains) > 0 else False,
            max_sig_feats=args.n_sig_features,
            max_strains=N_LOWEST_PVAL if len(hit_strains_nsig) == 0 else None,
            sns_colour_palette="tab10",
            verbose=False)

        if SUBSET_HIT_STRAINS:
            strain_list = [control] + hit_strains[:TOP_N_HITS]
            print("Subsetting for Top%d hit strains" % (len(strain_list) - 1))
            features, metadata = subset_results(features,
                                                metadata,
                                                column=grouping_var,
                                                groups=strain_list,
                                                verbose=False)
        else:
            strain_list = list(metadata[grouping_var].unique())

# =============================================================================
#         # NOT NECESSARY FOR ALL STRAINS - LOOK AT CONTROL ONLY FOR THIS
#         # superplots of variation with respect to 'date_yyyymmdd'
#         print("\nPlotting superplots of date variation for significant features")
#         for feat in tqdm(fset[:args.n_sig_features]):
#             # plot day variation
#             superplot(features, metadata, feat,
#                       x1='date_yyyymmdd',
#                       x2=None,
#                       saveDir=plot_dir / 'superplots',
#                       figsize=[24,6],
#                       show_points=False,
#                       plot_means=True,
#                       dodge=False)
#             # plot run number vs day variation
#             superplot(features, metadata, feat,
#                       x1='date_yyyymmdd',
#                       x2='imaging_run_number',
#                       saveDir=plot_dir / 'superplots',
#                       figsize=[24,6],
#                       show_points=False,
#                       plot_means=True,
#                       dodge=True)
#             # plot plate number variation
#             superplot(features, metadata, feat,
#                       x1='date_yyyymmdd',
#                       x2='source_plate_id',
#                       saveDir=plot_dir / 'superplots',
#                       figsize=[24,6],
#                       show_points=False,
#                       plot_means=True,
#                       dodge=True)
#             # plot instrument name variation
#             superplot(features, metadata, feat,
#                       x1='date_yyyymmdd',
#                       x2='instrument_name',
#                       saveDir=plot_dir / 'superplots',
#                       figsize=[24,6],
#                       show_points=False,
#                       plot_means=True,
#                       dodge=True)
# =============================================================================

# from tierpsytools.analysis.significant_features import plot_feature_boxplots
# plot_feature_boxplots(feat_to_plot=features,
#                       y_class=metadata[grouping_var],
#                       scores=pvals_t.rank(axis=1),
#                       pvalues=np.asarray(pvals_t).flatten(),
#                       saveto=None,
#                       close_after_plotting=True)

    ##### Hierarchical Clustering Analysis #####

    # Z-normalise control data
    control_featZ = control_features.apply(zscore, axis=0)
    #featZ = (features-features.mean())/features.std() # minus mean, divide by std

    #from tierpsytools.preprocessing.scaling_class import scalingClass
    #scaler = scalingClass(scaling='standardize')
    #featZ = scaler.fit_transform(features)

    ### Control clustermap

    # control data is clustered and feature order is stored and applied to full data
    print("\nPlotting control clustermap")

    control_clustermap_path = plot_dir / 'heatmaps' / 'date_clustermap.pdf'
    cg = plot_clustermap(
        control_featZ,
        control_metadata,
        group_by=([grouping_var] if grouping_var == 'date_yyyymmdd' else
                  [grouping_var, 'date_yyyymmdd']),
        method=METHOD,
        metric=METRIC,
        figsize=[20, 6],
        sub_adj={
            'bottom': 0.05,
            'left': 0,
            'top': 1,
            'right': 0.85
        },
        saveto=control_clustermap_path,
        label_size=15,
        show_xlabels=False)
    # control clustermap with labels
    if args.n_top_feats <= 256:
        control_clustermap_path = plot_dir / 'heatmaps' / 'date_clustermap_label.pdf'
        cg = plot_clustermap(
            control_featZ,
            control_metadata,
            group_by=([grouping_var] if grouping_var == 'date_yyyymmdd' else
                      [grouping_var, 'date_yyyymmdd']),
            method=METHOD,
            metric=METRIC,
            figsize=[20, 10],
            sub_adj={
                'bottom': 0.5,
                'left': 0,
                'top': 1,
                'right': 0.85
            },
            saveto=control_clustermap_path,
            label_size=(15, 15),
            show_xlabels=True)

    #col_linkage = cg.dendrogram_col.calculated_linkage
    control_clustered_features = np.array(
        control_featZ.columns)[cg.dendrogram_col.reordered_ind]

    ### Full clustermap

    # Z-normalise data for all strains
    featZ = features.apply(zscore, axis=0)

    ## Save z-normalised values
    # z_stats = featZ.join(hit_metadata[grouping_var]).groupby(by=grouping_var).mean().T
    # z_stats.columns = ['z-mean_' + v for v in z_stats.columns.to_list()]
    # z_stats.to_csv(z_stats_path, header=True, index=None)

    # Clustermap of full data
    print("Plotting all strains clustermap")
    full_clustermap_path = plot_dir / 'heatmaps' / (grouping_var +
                                                    '_clustermap.pdf')
    fg = plot_clustermap(featZ,
                         metadata,
                         group_by=grouping_var,
                         row_colours=None,
                         method=METHOD,
                         metric=METRIC,
                         figsize=[20, 30],
                         sub_adj={
                             'bottom': 0.01,
                             'left': 0,
                             'top': 1,
                             'right': 0.95
                         },
                         saveto=full_clustermap_path,
                         label_size=8,
                         show_xlabels=False)

    if args.n_top_feats <= 256:
        full_clustermap_path = plot_dir / 'heatmaps' / (
            grouping_var + '_clustermap_label.pdf')
        fg = plot_clustermap(featZ,
                             metadata,
                             group_by=grouping_var,
                             row_colours=None,
                             method=METHOD,
                             metric=METRIC,
                             figsize=[20, 40],
                             sub_adj={
                                 'bottom': 0.18,
                                 'left': 0,
                                 'top': 1,
                                 'right': 0.95
                             },
                             saveto=full_clustermap_path,
                             label_size=(15, 10),
                             show_xlabels=True)

    # clustered feature order for all strains
    _ = np.array(featZ.columns)[fg.dendrogram_col.reordered_ind]

    pvals_heatmap = anova_table.loc[control_clustered_features, 'pvals']
    pvals_heatmap.name = 'P < {}'.format(args.pval_threshold)

    assert all(f in featZ.columns for f in pvals_heatmap.index)

    # Plot heatmap (averaged for each sample)
    if len(metadata[grouping_var].unique()) < 250:
        print("\nPlotting barcode heatmap")
        heatmap_path = plot_dir / 'heatmaps' / (grouping_var + '_heatmap.pdf')
        plot_barcode_heatmap(
            featZ=featZ[control_clustered_features],
            meta=metadata,
            group_by=[grouping_var],
            pvalues_series=pvals_heatmap,
            p_value_threshold=args.pval_threshold,
            selected_feats=None,  # fset if len(fset) > 0 else None
            saveto=heatmap_path,
            figsize=[20, 30],
            sns_colour_palette="Pastel1",
            label_size=10)

    ##### Principal Components Analysis #####

    pca_dir = plot_dir / 'PCA'

    # remove outlier samples from PCA
    if args.remove_outliers:
        outlier_path = pca_dir / 'mahalanobis_outliers.pdf'
        features, inds = remove_outliers_pca(df=features, saveto=outlier_path)
        metadata = metadata.reindex(features.index)  # reindex metadata
        featZ = features.apply(zscore, axis=0)  # re-normalise data

        # Drop features with NaN values after normalising
        n_cols = len(featZ.columns)
        featZ.dropna(axis=1, inplace=True)
        n_dropped = n_cols - len(featZ.columns)
        if n_dropped > 0:
            print("Dropped %d features after normalisation (NaN)" % n_dropped)

    coloured_strains_pca = [control] + (hit_strains[:15] if len(fset) > 0 else [])
    coloured_strains_pca = [
        s for s in coloured_strains_pca
        if s in metadata[grouping_var].unique()
    ]

    #from tierpsytools.analysis.decomposition import plot_pca
    _ = plot_pca(featZ,
                 metadata,
                 group_by=grouping_var,
                 control=control,
                 var_subset=coloured_strains_pca,
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 sns_colour_palette="plasma",
                 n_dims=2,
                 label_size=8,
                 sub_adj={
                     'bottom': 0.13,
                     'left': 0.13,
                     'top': 0.95,
                     'right': 0.88
                 },
                 legend_loc=[1.02, 0.6],
                 hypercolor=False)

    # add details of COG category information to metadata
    # (using hard-coded dict of info from Baba et al. 2006 paper)
    metadata['COG_category'] = metadata['COG_category'].map(COG_category_dict)

    # plot pca coloured by Keio COG category
    _ = plot_pca(featZ,
                 metadata,
                 group_by='COG_category',
                 control=None,
                 var_subset=list(metadata['COG_category'].dropna().unique()),
                 saveDir=pca_dir / 'COG',
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 n_dims=2,
                 hypercolor=False,
                 label_size=8,
                 figsize=[12, 8],
                 sub_adj={
                     'bottom': 0.1,
                     'left': 0.1,
                     'top': 0.95,
                     'right': 0.7
                 },
                 legend_loc=[1.02, 0.6],
                 sns_colour_palette="plasma")

    ##### t-distributed Stochastic Neighbour Embedding #####

    print("\nPerforming tSNE")
    tsne_dir = plot_dir / 'tSNE'
    perplexities = [mean_sample_size]  # NB: should be roughly equal to group size
    _ = plot_tSNE(featZ,
                  metadata,
                  group_by=grouping_var,
                  var_subset=coloured_strains_pca,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")

    print("\nPerforming tSNE")
    tsne_dir = plot_dir / 'tSNE'
    perplexities = [mean_sample_size
                    ]  # NB: should be roughly equal to group size
    _ = plot_tSNE(featZ,
                  metadata,
                  group_by='COG_category',
                  var_subset=list(metadata['COG_category'].dropna().unique()),
                  saveDir=tsne_dir / 'COG_category',
                  perplexities=perplexities,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")

    ##### Uniform Manifold Projection #####

    print("\nPerforming UMAP")
    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [mean_sample_size]  # NB: should be roughly equal to group size
    min_dist = 0.1  # Minimum distance parameter
    _ = plot_umap(featZ,
                  metadata,
                  group_by=grouping_var,
                  var_subset=coloured_strains_pca,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")
Example #6
def compare_keio_rescue(features, metadata, args):
    """ Compare Keio single-gene deletion mutants with wild-type BW25113 control under different 
        antioxidant treatment conditions, and look to see if the addition of antioxidants can rescue
        the worm phenotype on these mutant strains, effectively bringing the worms back to wild-type
        behaviour.
        
        - Plot boxplots for each strain, comparing each pairwise antioxidant condition vs the control
          (for all features)
        
        Inputs
        ------
        features, metadata : pd.DataFrame
            Matching features summaries and metadata
        
        args : Object 
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - omit_strains : list
            - control_dict : dict
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int
    """

    assert set(features.index) == set(metadata.index)

    strain_list = list(metadata[STRAIN_COLNAME].unique())
    antioxidant_list = list(metadata[TREATMENT_COLNAME].unique())
    assert CONTROL_STRAIN in strain_list and CONTROL_TREATMENT in antioxidant_list
    strain_list = [CONTROL_STRAIN] + [s for s in sorted(strain_list) if s != CONTROL_STRAIN]
    antioxidant_list = [CONTROL_TREATMENT] + [a for a in sorted(antioxidant_list) if a != CONTROL_TREATMENT]
    n_strains = len(strain_list)
    n_antiox = len(antioxidant_list)
    
    # assert there will be no errors due to case-sensitivity
    assert len(metadata[STRAIN_COLNAME].unique()) == len(metadata[STRAIN_COLNAME].str.upper().unique())
    assert len(metadata[TREATMENT_COLNAME].unique()) == len(metadata[TREATMENT_COLNAME].str.upper().unique())
    
    assert not features.isna().any().any()
    n_feats = features.shape[1]

    # construct save paths
    save_dir = get_save_dir(args)
    stats_dir = save_dir / "Stats" / args.fdr_method
    plot_dir = save_dir / "Plots" / args.fdr_method
    plot_dir.mkdir(exist_ok=True, parents=True)

    # Print mean sample size
    sample_size = df_summary_stats(metadata, columns=[STRAIN_COLNAME, TREATMENT_COLNAME])
    ss_savepath = save_dir / 'sample_sizes.csv'
    sample_size.to_csv(ss_savepath, index=False)  

    # add combined treatment column (for heatmap/PCA)
    metadata['treatment_combination'] = [str(s) + '_' + str(a) for s, a in 
                                         zip(metadata[STRAIN_COLNAME], metadata[TREATMENT_COLNAME])]

    # Subset for control data for strain and for treatment
    control_strain_meta = metadata[metadata[STRAIN_COLNAME] == CONTROL_STRAIN]
    control_strain_feat = features.reindex(control_strain_meta.index)
    
    #control_antiox_meta = metadata[metadata[TREATMENT_COLNAME] == CONTROL_TREATMENT]
    #control_antiox_feat = features.reindex(control_antiox_meta.index)
                
# =============================================================================
#     ##### Control variation ##### 
#     # Clean data after subset - to remove features with zero std
#     control_strain_feat, control_strain_meta = clean_summary_results(control_strain_feat, 
#                                                                      control_strain_meta, 
#                                                                      max_value_cap=False,
#                                                                      imputeNaN=False)  
#     
#     control_antiox_feat, control_antiox_meta = clean_summary_results(control_antiox_feat, 
#                                                                      control_antiox_meta, 
#                                                                      max_value_cap=False,
#                                                                      imputeNaN=False)                   
#     if args.analyse_control:
#         control_variation(control_strain_feat, control_strain_meta, args,
#                           variables=[TREATMENT_COLNAME], n_sig_features=10)
#         control_variation(control_antiox_feat, control_antiox_meta, args,
#                           variables=[STRAIN_COLNAME], n_sig_features=10)                            
# =============================================================================
       
    print("\nComparing %d %ss with %d %s treatments for %d features" %\
          (n_strains, STRAIN_COLNAME, n_antiox, TREATMENT_COLNAME, n_feats))

    t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney' # aka. Wilcoxon rank-sum

    ##### FOR EACH STRAIN #####
    
    for strain in tqdm(strain_list[1:]):
        print("\nPlotting results for %s:" % strain)
        strain_meta = metadata[metadata[STRAIN_COLNAME]==strain]
        strain_feat = features.reindex(strain_meta.index)
        
        # Load ANOVA results for strain
        if not args.use_corrected_pvals:
            anova_strain_path = stats_dir / '{}_uncorrected.csv'.format((args.test + '_' + strain))
        else:
            anova_strain_path = stats_dir / '{}_results.csv'.format((args.test + '_' + strain))
        anova_strain_table = pd.read_csv(anova_strain_path, index_col=0)            
        strain_pvals = anova_strain_table.sort_values(by='pvals', ascending=True)['pvals'] # rank features by p-value
        #strain_fset = strain_pvals[strain_pvals < args.pval_threshold].index.to_list()  
        
        # load antioxidant t-test results
        ttest_strain_path = stats_dir / 'pairwise_ttests' / '{}_results.csv'.format(strain + 
                            "_vs_" + CONTROL_STRAIN)
        ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0)
        strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]] 
        strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns]       
        #strain_fset_t = strain_pvals_t[(strain_pvals_t < args.pval_threshold).sum(axis=1) > 0].index.to_list()
           
        # Plot ranked n significant features by t-test for each antioxidant treatment
        ranked_antiox_nsig = (strain_pvals_t < args.pval_threshold).sum(axis=0).sort_values(ascending=False)
        ranked_antiox_nsig_path = plot_dir / ('{}_ranked_number_sigfeats_'.format(strain) + ('uncorrected' 
                                              if args.fdr_method is None else args.fdr_method) + '.png')
        plt.close('all')
        fig, ax = plt.subplots() #figsize=(20,6)
        ax.plot(ranked_antiox_nsig)
        ax.set_xticks(range(len(ranked_antiox_nsig)))
        ax.set_xticklabels(ranked_antiox_nsig.index.to_list(), rotation=90, fontsize=5)
        plt.xlabel("Antioxidant (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Number of significant features", fontsize=12, labelpad=10)
        plt.tight_layout()
        plt.savefig(ranked_antiox_nsig_path, dpi=600)
        
        # Plot ranked lowest pval by t-test for each antioxidant treatment
        ranked_antiox_pval = strain_pvals_t.min(axis=0).sort_values(ascending=True)
        lowest_antiox_pval_path = plot_dir / ('{}_ranked_lowest_pval_'.format(strain) + ('uncorrected' 
                                              if args.fdr_method is None else args.fdr_method) + '.png')
        plt.close('all')
        fig, ax = plt.subplots()
        ax.plot(ranked_antiox_pval)
        plt.axhline(y=args.pval_threshold, c='dimgray', ls='--')
        ax.set_xticks(range(len(ranked_antiox_pval)))
        ax.set_xticklabels(ranked_antiox_pval.index.to_list(), rotation=90, fontsize=5)
        plt.xlabel("Antioxidant (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Lowest p-value by t-test", fontsize=12, labelpad=10)
        plt.tight_layout()
        plt.savefig(lowest_antiox_pval_path, dpi=600)
        plt.close()

# =============================================================================
#         print("\nMaking errorbar plots")
#         errorbar_sigfeats(strain_feat, strain_meta, 
#                           group_by=TREATMENT_COLNAME, 
#                           fset=strain_pvals.index, 
#                           control=CONTROL_TREATMENT, 
#                           rank_by='mean',
#                           max_feats2plt=args.n_sig_features, 
#                           figsize=[20,10], 
#                           fontsize=15,
#                           ms=20,
#                           elinewidth=7,
#                           fmt='.',
#                           tight_layout=[0.01,0.01,0.99,0.99],
#                           saveDir=plot_dir / 'errorbar' / strain)
# =============================================================================
                
        if strain != CONTROL_STRAIN:
            # concatenate control + strain data (rows)
            plot_meta = pd.concat([control_strain_meta, strain_meta], ignore_index=True)
            plot_feat = pd.concat([control_strain_feat, strain_feat], ignore_index=True)
            # join metadata and features (columns)
            plot_df = plot_meta.join(plot_feat)

            # Plot boxplots comparing strain vs control for each antioxidant treatment,
            # for each feature ranked by ANOVA p-value
            for f, feature in enumerate(tqdm(strain_pvals.index)):
                            
                plt.close('all')
                fig, ax = plt.subplots(figsize=(10,8))
                ax = sns.boxplot(x=TREATMENT_COLNAME, y=feature, hue=STRAIN_COLNAME, data=plot_df,
                                 palette='Set3', dodge=True, order=antioxidant_list)
                ax = sns.swarmplot(x=TREATMENT_COLNAME, y=feature, hue=STRAIN_COLNAME, data=plot_df,
                                   color='k', alpha=0.7, size=4, dodge=True, order=antioxidant_list)
                n_labs = len(plot_df[STRAIN_COLNAME].unique())
                handles, labels = ax.get_legend_handles_labels()
                ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
                ax.set_xlabel(TREATMENT_COLNAME, fontsize=15, labelpad=10)
                ax.set_ylabel(feature.replace('_',' '), fontsize=15, labelpad=10)
                
                # scale plot to omit outliers (>2.5*IQR from the group median)
                if scale_outliers_box:
                    grouped_strain = plot_df.groupby(TREATMENT_COLNAME)
                    y_bar = grouped_strain[feature].median() # median is less skewed by outliers
                    # Computing IQR
                    Q1 = grouped_strain[feature].quantile(0.25)
                    Q3 = grouped_strain[feature].quantile(0.75)
                    IQR = Q3 - Q1
                    plt.ylim(min(y_bar) - 2.5 * max(IQR), max(y_bar) + 2.5 * max(IQR))
                    
                # annotate p-values
                for ii, antiox in enumerate(antioxidant_list):
                    try:
                        p = strain_pvals_t.loc[feature, antiox]
                        text = ax.get_xticklabels()[ii]
                        assert text.get_text() == antiox
                        p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
                        #y = (y_bar[antiox] + 2 * IQR[antiox]) if scale_outliers_box else plot_df[feature].max()
                        #h = (max(IQR) / 10) if scale_outliers_box else (y - plot_df[feature].min()) / 50
                        trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
                        plt.plot([ii-.2, ii-.2, ii+.2, ii+.2], 
                                 [0.8, 0.81, 0.81, 0.8], #[y+h, y+2*h, y+2*h, y+h], 
                                 lw=1.5, c='k', transform=trans)
                        ax.text(ii, 0.82, p_text, fontsize=9, ha='center', va='bottom', transform=trans)
                    except Exception as e:
                        print(e)
                        
                fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / ('{}_'.format(f+1) + 
                                                                             feature + '.png')
                fig_savepath.parent.mkdir(parents=True, exist_ok=True)
                plt.savefig(fig_savepath)
            
    ##### FOR EACH ANTIOXIDANT #####
    
    for antiox in antioxidant_list:
        print("\nPlotting results for %s:" % antiox)
        #antiox_meta = metadata[metadata[TREATMENT_COLNAME]==antiox]
        #antiox_feat = features.reindex(antiox_meta.index)

        # Load ANOVA results for antioxidant
        # if not args.use_corrected_pvals:
        #     anova_antiox_path = stats_dir / '{}_uncorrected.csv'.format((args.test + '_' + antiox))
        # else:
        #     anova_antiox_path = stats_dir / '{}_results.csv'.format((args.test + '_' + antiox))
        #anova_antiox_table = pd.read_csv(anova_antiox_path, index_col=0)            
        #antiox_pvals = anova_antiox_table.sort_values(by='pvals', ascending=True)['pvals'] # rank features by p-value
        #antiox_fset = antiox_pvals[antiox_pvals < args.pval_threshold].index.to_list()  
        
        # Load t-test results
        if not args.use_corrected_pvals:
            ttest_antiox_path = stats_dir / '{}_uncorrected.csv'.format((t_test + '_' + antiox))
        else:
            ttest_antiox_path = stats_dir / '{}_results.csv'.format((t_test + '_' + antiox))
        ttest_antiox_table = pd.read_csv(ttest_antiox_path, index_col=0)
        antiox_pvals_t = ttest_antiox_table[[c for c in ttest_antiox_table if "pvals_" in c]] 
        antiox_pvals_t.columns = [c.split('pvals_')[-1] for c in antiox_pvals_t.columns]       
        #antiox_fset_t = antiox_pvals_t[(antiox_pvals_t < args.pval_threshold).sum(axis=1) > 0].index.to_list()
     
        # Plot ranked n significant features by t-test for each strain
        ranked_strain_nsig = (antiox_pvals_t < args.pval_threshold).sum(axis=0).sort_values(ascending=False)
        ranked_strain_nsig_path = plot_dir / ('{}_ranked_number_sigfeats_'.format(antiox) + ('uncorrected' 
                                              if args.fdr_method is None else args.fdr_method) + '.png')
        plt.close('all')
        fig, ax = plt.subplots() #figsize=(20,6)
        ax.plot(ranked_strain_nsig)
        ax.set_xticks(range(len(ranked_strain_nsig)))
        ax.set_xticklabels(ranked_strain_nsig.index.to_list(), rotation=90, fontsize=5)
        plt.xlabel("Strain (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Number of significant features", fontsize=12, labelpad=10)
        plt.tight_layout()
        plt.savefig(ranked_strain_nsig_path, dpi=600)
        
        # Plot ranked lowest pval by t-test for each strain
        ranked_strain_pval = antiox_pvals_t.min(axis=0).sort_values(ascending=True)
        lowest_strain_pval_path = plot_dir / ('{}_ranked_lowest_pval_'.format(antiox) + ('uncorrected' 
                                              if args.fdr_method is None else args.fdr_method) + '.png')
        plt.close('all')
        fig, ax = plt.subplots()
        ax.plot(ranked_strain_pval)
        plt.axhline(y=args.pval_threshold, c='dimgray', ls='--')
        ax.set_xticks(range(len(ranked_strain_pval)))
        ax.set_xticklabels(ranked_strain_pval.index.to_list(), rotation=90, fontsize=5)
        plt.xlabel("Strain (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Lowest p-value by t-test", fontsize=12, labelpad=10)
        plt.tight_layout()
        plt.savefig(lowest_strain_pval_path, dpi=600)
        plt.close()
        
        # Plot boxplots for top 10 features comparing antioxidant vs None for each strain
        # TODO

# =============================================================================
#         print("Making boxplots")
#         boxplots_grouped(feat_meta_df=metadata.join(features), 
#                           group_by=grouping_var,
#                           control_group=control,
#                           test_pvalues_df=(pvals_t.T if len(fset) > 0 else None), 
#                           feature_set=fset,
#                           max_feats2plt=args.n_sig_features, 
#                           max_groups_plot_cap=None,
#                           p_value_threshold=args.pval_threshold,
#                           drop_insignificant=False,
#                           sns_colour_palette="tab10",
#                           figsize=[6,130], 
#                           saveDir=plot_dir / ('boxplots' + '_' + (
#                                   'uncorrected' if args.fdr_method is None else args.fdr_method) + 
#                                   '.png'))
# =============================================================================

        # # If no sigfeats, subset for top strains ranked by lowest p-value by t-test for any feature    
        # if len(hit_strains_nsig) == 0:
        #     print("\Saving lowest %d strains ranked by p-value for any feature" % N_LOWEST_PVAL)
        #     write_list_to_file(hit_strains_pval, stats_dir / 'Top100_lowest_pval.txt')
        #     hit_strains = hit_strains_pval
        # elif len(hit_strains_nsig) > 0:
        #     hit_strains = hit_strains_nsig

        # # Individual boxplots of significant features by pairwise t-test (each group vs control)
        # boxplots_sigfeats(features,
        #                   y_class=metadata[grouping_var],
        #                   control=control,
        #                   pvals=pvals_t, 
        #                   z_class=metadata['date_yyyymmdd'],
        #                   feature_set=None,
        #                   saveDir=plot_dir / 'paired_boxplots',
        #                   p_value_threshold=args.pval_threshold,
        #                   drop_insignificant=True if len(hit_strains) > 0 else False,
        #                   max_sig_feats=args.n_sig_features,
        #                   max_strains=N_LOWEST_PVAL if len(hit_strains_nsig) == 0 else None,
        #                   sns_colour_palette="tab10",
        #                   verbose=False)
        
        # if SUBSET_HIT_STRAINS:
        #     strain_list = [control] + hit_strains[:TOP_N_HITS]
        #     print("Subsetting for Top%d hit strains" % (len(strain_list)-1))
        #     features, metadata = subset_results(features, metadata, column=grouping_var,
        #                                         groups=strain_list, verbose=False)   
        # else:
        #     strain_list = list(metadata[grouping_var].unique())
                   
    ##### Hierarchical Clustering Analysis #####

    # Z-normalise control data
    control_strain_featZ = control_strain_feat.apply(zscore, axis=0)
    
    ### Control clustermap
    
    # control data is clustered and feature order is stored and applied to full data
    print("\nPlotting clustermap for %s control" % CONTROL_STRAIN)
    
    control_clustermap_path = plot_dir / 'heatmaps' / 'control_clustermap.pdf'
    cg = plot_clustermap(control_strain_featZ, control_strain_meta,
                         group_by='treatment_combination',
                         method=METHOD, 
                         metric=METRIC,
                         figsize=[20,6],
                         sub_adj={'bottom':0.05,'left':0,'top':1,'right':0.85},
                         saveto=control_clustermap_path,
                         label_size=15,
                         show_xlabels=False)
    # control clustermap with labels
    if args.n_top_feats <= 256:
        control_clustermap_path = plot_dir / 'heatmaps' / 'control_clustermap_label.pdf'
        cg = plot_clustermap(control_strain_featZ, control_strain_meta,
                             group_by='treatment_combination',
                             method=METHOD, 
                             metric=METRIC,
                             figsize=[20,10],
                             sub_adj={'bottom':0.7,'left':0,'top':1,'right':0.85},
                             saveto=control_clustermap_path,
                             label_size=(15,15),
                             show_xlabels=True)

    #col_linkage = cg.dendrogram_col.calculated_linkage
    control_clustered_features = np.array(control_strain_featZ.columns)[cg.dendrogram_col.reordered_ind]
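# =============================================================================
#     # For orientation: seaborn's clustermap derives this column ordering via
#     # scipy hierarchical clustering; a bare-bones sketch of the equivalent,
#     # assuming METHOD/METRIC are valid scipy linkage arguments:
#     from scipy.cluster.hierarchy import linkage, leaves_list
#     Z = linkage(control_strain_featZ.T, method=METHOD, metric=METRIC)
#     feature_order = control_strain_featZ.columns[leaves_list(Z)]
# =============================================================================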

    # ### Full clustermap 
    # TODO: all strains, for each treatment
    # TODO: all treatments, for each strain
    # all strains/treatments together
    
    # Z-normalise data for all strains
    featZ = features.apply(zscore, axis=0)
                    
    ## Save z-normalised values
    # z_stats = featZ.join(hit_metadata[grouping_var]).groupby(by=grouping_var).mean().T
    # z_stats.columns = ['z-mean_' + v for v in z_stats.columns.to_list()]
    # z_stats.to_csv(z_stats_path, header=True, index=None)
    
    # Clustermap of full data   
    print("Plotting all strains clustermap")    
    full_clustermap_path = plot_dir / 'heatmaps' / 'full_clustermap.pdf'
    fg = plot_clustermap(featZ, metadata, 
                         group_by='treatment_combination',
                         row_colours=None,
                         method=METHOD, 
                         metric=METRIC,
                         figsize=[20,30],
                         sub_adj={'bottom':0.01,'left':0,'top':1,'right':0.95},
                         saveto=full_clustermap_path,
                         label_size=8,
                         show_xlabels=False)
    if args.n_top_feats <= 256:
        full_clustermap_path = plot_dir / 'heatmaps' / 'full_clustermap_label.pdf'
        fg = plot_clustermap(featZ, metadata, 
                             group_by='treatment_combination',
                             row_colours=None,
                             method=METHOD, 
                             metric=METRIC,
                             figsize=[20,40],
                             sub_adj={'bottom':0.18,'left':0,'top':1,'right':0.95},
                             saveto=full_clustermap_path,
                             label_size=(15,10),
                             show_xlabels=True)
    
    # clustered feature order for all strains
    _ = np.array(featZ.columns)[fg.dendrogram_col.reordered_ind]
    
    # NB: anova_strain_table here is the table loaded for the last strain in the loop above
    pvals_heatmap = anova_strain_table.loc[control_clustered_features, 'pvals']
    pvals_heatmap.name = 'P < {}'.format(args.pval_threshold)

    assert all(f in featZ.columns for f in pvals_heatmap.index)
            
    # Plot heatmap (averaged for each sample)
    if len(metadata['treatment_combination'].unique()) < 250:
        print("\nPlotting barcode heatmap")
        heatmap_path = plot_dir / 'heatmaps' / 'full_heatmap.pdf'
        plot_barcode_heatmap(featZ=featZ[control_clustered_features], 
                             meta=metadata, 
                             group_by='treatment_combination', 
                             pvalues_series=pvals_heatmap,
                             p_value_threshold=args.pval_threshold,
                             selected_feats=None, # fset if len(fset) > 0 else None
                             saveto=heatmap_path,
                             figsize=[20,30],
                             sns_colour_palette="Pastel1",
                             label_size=15,
                             sub_adj={'top':0.95,'bottom':0.01,'left':0.15,'right':0.92})        
                    
    # ##### Principal Components Analysis #####

    pca_dir = plot_dir / 'PCA'
    
    # remove outlier samples from PCA
    if args.remove_outliers:
        outlier_path = pca_dir / 'mahalanobis_outliers.pdf'
        features, inds = remove_outliers_pca(df=features, saveto=outlier_path)
        metadata = metadata.reindex(features.index) # reindex metadata
        featZ = features.apply(zscore, axis=0) # re-normalise data

        # Drop features with NaN values after normalising
        n_cols = len(featZ.columns)
        featZ.dropna(axis=1, inplace=True)
        n_dropped = n_cols - len(featZ.columns)
        if n_dropped > 0:
            print("Dropped %d features after normalisation (NaN)" % n_dropped)

    # plot PCA 
    # Total of 50 treatment combinations, so plot fes/fepD/entA/wild_type only
    treatment_list = sorted(list(metadata['treatment_combination'].unique()))
    treatment_subset = [i for i in treatment_list if i.split('_')[0] in ['fes','fepD','entA','wild']]
    _ = plot_pca(featZ, metadata, 
                 group_by='treatment_combination', 
                 control=CONTROL_STRAIN + '_' + CONTROL_TREATMENT,
                 var_subset=treatment_subset, 
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 sns_colour_palette="plasma",
                 n_dims=2,
                 label_size=8,
                 sub_adj={'bottom':0.13,'left':0.13,'top':0.95,'right':0.88},
                 legend_loc=[1.02,0.6],
                 hypercolor=False)
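# =============================================================================
#     # plot_pca is the project helper; the core computation it wraps is
#     # equivalent to this scikit-learn sketch (for orientation only):
#     from sklearn.decomposition import PCA
#     pca = PCA(n_components=10)
#     projected = pca.fit_transform(featZ.values)  # shape: (n_samples, 10)
#     explained = pca.explained_variance_ratio_    # variance explained per PC
# =============================================================================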

    # add details of COG category information to metadata 
    # (using hard-coded dict of info from Baba et al. 2006 paper)
    metadata['COG_category'] = metadata['COG_category'].map(COG_category_dict)
    
    # plot pca coloured by Keio COG category    
    _ = plot_pca(featZ, metadata, 
                 group_by='COG_category', 
                 control=None,
                 var_subset=list(metadata['COG_category'].dropna().unique()), 
                 saveDir=pca_dir / 'COG',
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 n_dims=2,
                 hypercolor=False,
                 label_size=8,
                 figsize=[12,8],
                 sub_adj={'bottom':0.1,'left':0.1,'top':0.95,'right':0.7},
                 legend_loc=[1.02,0.6],
                 sns_colour_palette="plasma")

    ##### t-distributed Stochastic Neighbour Embedding #####   
    mean_sample_size = int(sample_size['n_samples'].mean())

    print("\nPerforming tSNE")
    tsne_dir = plot_dir / 'tSNE'
    perplexities = [mean_sample_size] # NB: should be roughly equal to group size    
    _ = plot_tSNE(featZ, metadata,
                  group_by='treatment_combination',
                  var_subset=treatment_subset,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=[8,8],
                  label_size=7,
                  marker_size=30,
                  sns_colour_palette="plasma")
    
    print("\nPerforming tSNE")
    tsne_dir = plot_dir / 'tSNE'
    perplexities = [mean_sample_size] # NB: should be roughly equal to group size    
    _ = plot_tSNE(featZ, metadata,
                  group_by='COG_category',
                  var_subset=list(metadata['COG_category'].dropna().unique()),
                  saveDir=tsne_dir / 'COG_category',
                  perplexities=perplexities,
                  figsize=[8,8],
                  label_size=7,
                  marker_size=30,
                  sns_colour_palette="plasma")

    ##### Uniform Manifold Approximation and Projection (UMAP) #####
    
    print("\nPerforming UMAP")
    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [mean_sample_size] # NB: should be roughly equal to group size
    min_dist = 0.1 # Minimum distance parameter    
    _ = plot_umap(featZ, metadata,
                  group_by='treatment_combination',
                  var_subset=treatment_subset,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=[8,8],
                  label_size=7,
                  marker_size=30,
                  sns_colour_palette="plasma")   
def antioxidant_stats(features, metadata, args):
    """ Perform statistical analyses on Keio antioxidant rescue experiment results:
        - ANOVA tests for significant feature variation between strains (for each antioxidant treatment in turn)
        - ANOVA tests for significant feature variation in antioxidant treatment (for each strain in turn)
        - t-tests for each feature comparing each strain vs control for paired antioxidant treatment conditions
        - t-tests for each feature comparing each strain antioxidant treatment to negative control (no antioxidant)
        
        Inputs
        ------
        features, metadata : pd.DataFrame
            Clean feature summaries and accompanying metadata
        
        args : Object
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - control_dict : dict
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int           
    """

    # categorical variables to investigate: 'gene_name' and 'antioxidant'
    print(
        "\nInvestigating variation in worm behaviour on hit strains treated with different antioxidants"
    )

    # assert there will be no errors due to case-sensitivity
    assert len(metadata[STRAIN_COLNAME].unique()) == len(
        metadata[STRAIN_COLNAME].str.upper().unique())
    assert len(metadata[TREATMENT_COLNAME].unique()) == len(
        metadata[TREATMENT_COLNAME].str.upper().unique())

    assert not features.isna().any().any()

    strain_list = list(metadata[STRAIN_COLNAME].unique())
    antioxidant_list = list(metadata[TREATMENT_COLNAME].unique())
    assert CONTROL_STRAIN in strain_list and CONTROL_TREATMENT in antioxidant_list

    # print mean sample size
    sample_size = df_summary_stats(metadata,
                                   columns=[STRAIN_COLNAME, TREATMENT_COLNAME])
    print("Mean sample size of %s: %d" %
          (STRAIN_COLNAME, int(sample_size['n_samples'].mean())))

    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / "Stats" / args.fdr_method

    ### For each antioxidant treatment in turn...

    for antiox in antioxidant_list:
        print("\n%s" % antiox)
        meta_antiox = metadata[metadata[TREATMENT_COLNAME] == antiox]
        feat_antiox = features.reindex(meta_antiox.index)

        ### ANOVA tests for significant variation between strains

        # make path to save ANOVA results
        test_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(
            (args.test + '_' + antiox))
        test_path = stats_dir / '{}_results.csv'.format(
            (args.test + '_' + antiox))
        test_path.parent.mkdir(exist_ok=True, parents=True)

        if len(meta_antiox[STRAIN_COLNAME].unique()) > 2:
            # perform ANOVA + record results before & after correcting for multiple comparisons
            stats, pvals, reject = univariate_tests(
                X=feat_antiox,
                y=meta_antiox[STRAIN_COLNAME],
                test=args.test,
                control=CONTROL_STRAIN,
                comparison_type='multiclass',
                multitest_correction=None,  # uncorrected
                alpha=args.pval_threshold,
                n_permutation_test=None)  # 'all'

            # get effect sizes
            effect_sizes = get_effect_sizes(X=feat_antiox,
                                            y=meta_antiox[STRAIN_COLNAME],
                                            control=CONTROL_STRAIN,
                                            effect_type=None,
                                            linked_test=args.test)

            # compile + save results (uncorrected)
            test_results = pd.concat([stats, effect_sizes, pvals, reject],
                                     axis=1)
            test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
            test_results['significance'] = sig_asterix(test_results['pvals'])
            test_results = test_results.sort_values(
                by=['pvals'], ascending=True)  # rank pvals
            test_results.to_csv(test_path_uncorrected, header=True, index=True)

            # correct for multiple comparisons
            reject_corrected, pvals_corrected = _multitest_correct(
                pvals,
                multitest_method=args.fdr_method,
                fdr=args.pval_threshold)

            # compile + save results (corrected)
            test_results = pd.concat(
                [stats, effect_sizes, pvals_corrected, reject_corrected],
                axis=1)
            test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
            test_results['significance'] = sig_asterix(test_results['pvals'])
            test_results = test_results.sort_values(
                by=['pvals'], ascending=True)  # rank pvals
            test_results.to_csv(test_path, header=True, index=True)

            print(
                "%s differences in '%s' across strains on %s (%s, P<%.2f, %s)"
                % (("SIGNIFICANT" if reject_corrected.loc[FEATURE, args.test]
                    else "No significant"), FEATURE, antiox, args.test,
                   args.pval_threshold, args.fdr_method))
        else:
            print("\nWARNING: Not enough %s groups for %s (n=%d)" %\
                  (STRAIN_COLNAME, args.test, len(meta_antiox[STRAIN_COLNAME].unique())))

        ### t-tests comparing each strain vs control for each antioxidant treatment condition

        if len(meta_antiox[STRAIN_COLNAME].unique()) == 2 or (
                len(meta_antiox[STRAIN_COLNAME].unique()) > 2
                and reject_corrected.loc[FEATURE, args.test]):

            # t-test to use
            t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum
            ttest_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(
                (t_test + '_' + antiox))
            ttest_path = stats_dir / '{}_results.csv'.format(
                (t_test + '_' + antiox))
            ttest_path.parent.mkdir(exist_ok=True, parents=True)

            # perform t-tests (without correction for multiple testing)
            stats_t, pvals_t, reject_t = univariate_tests(
                X=feat_antiox,
                y=meta_antiox[STRAIN_COLNAME],
                control=CONTROL_STRAIN,
                test=t_test,
                comparison_type='binary_each_group',
                multitest_correction=None,
                alpha=0.05)
            # get effect sizes for comparisons
            effect_sizes_t = get_effect_sizes(X=feat_antiox,
                                              y=meta_antiox[STRAIN_COLNAME],
                                              control=CONTROL_STRAIN,
                                              effect_type=None,
                                              linked_test=t_test)

            # compile + save t-test results (uncorrected)
            stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            effect_sizes_t.columns = [
                'effect_size_' + str(c) for c in effect_sizes_t.columns
            ]
            ttest_uncorrected = pd.concat(
                [stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_uncorrected.to_csv(ttest_path_uncorrected,
                                     header=True,
                                     index=True)

            # correct for multiple comparisons
            pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
            reject_t, pvals_t = _multitest_correct(
                pvals_t,
                multitest_method=args.fdr_method,
                fdr=args.pval_threshold)

            # compile + save t-test results (corrected)
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            ttest_corrected = pd.concat(
                [stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_corrected.to_csv(ttest_path, header=True, index=True)

            nsig = reject_t.loc[FEATURE].sum()
            print("%d %ss differ from %s in '%s' on %s (%s, P<%.2f, %s)" %
                  (nsig, STRAIN_COLNAME, CONTROL_STRAIN, FEATURE, antiox,
                   t_test, args.pval_threshold, args.fdr_method))

    ### For each strain in turn...

    for strain in strain_list:
        print("\n%s" % strain)
        meta_strain = metadata[metadata[STRAIN_COLNAME] == strain]
        feat_strain = features.reindex(meta_strain.index)

        ### ANOVA tests for significant feature variation in antioxidant treatment

        # make path to save ANOVA results
        test_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(
            (args.test + '_' + strain))
        test_path = stats_dir / '{}_results.csv'.format(
            (args.test + '_' + strain))
        test_path.parent.mkdir(exist_ok=True, parents=True)

        if len(meta_strain[TREATMENT_COLNAME].unique()) > 2:
            # perform ANOVA + record results before & after correcting for multiple comparisons
            stats, pvals, reject = univariate_tests(
                X=feat_strain,
                y=meta_strain[TREATMENT_COLNAME],
                test=args.test,
                control=CONTROL_TREATMENT,
                comparison_type='multiclass',
                multitest_correction=None,  # uncorrected
                alpha=args.pval_threshold,
                n_permutation_test=None)  # 'all'

            # get effect sizes
            effect_sizes = get_effect_sizes(X=feat_strain,
                                            y=meta_strain[TREATMENT_COLNAME],
                                            control=CONTROL_TREATMENT,
                                            effect_type=None,
                                            linked_test=args.test)

            # compile + save results (uncorrected)
            test_results = pd.concat([stats, effect_sizes, pvals, reject],
                                     axis=1)
            test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
            test_results['significance'] = sig_asterix(test_results['pvals'])
            test_results = test_results.sort_values(
                by=['pvals'], ascending=True)  # rank pvals
            test_results.to_csv(test_path_uncorrected, header=True, index=True)

            # correct for multiple comparisons
            reject_corrected, pvals_corrected = _multitest_correct(
                pvals,
                multitest_method=args.fdr_method,
                fdr=args.pval_threshold)

            # compile + save results (corrected)
            test_results = pd.concat(
                [stats, effect_sizes, pvals_corrected, reject_corrected],
                axis=1)
            test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
            test_results['significance'] = sig_asterix(test_results['pvals'])
            test_results = test_results.sort_values(
                by=['pvals'], ascending=True)  # rank pvals
            test_results.to_csv(test_path, header=True, index=True)

            print("%s differences in '%s' across %ss for %s (%s, P<%.2f, %s)" %
                  (("SIGNIFICANT" if reject_corrected.loc[FEATURE, args.test]
                    else "No"), FEATURE, TREATMENT_COLNAME, strain, args.test,
                   args.pval_threshold, args.fdr_method))
        else:
            print("\nWARNING: Not enough %s groups for %s (n=%d)" %\
                  (TREATMENT_COLNAME, args.test, len(meta_strain[TREATMENT_COLNAME].unique())))

        ### t-tests comparing each antioxidant treatment to no antioxidant for each strain

        if len(meta_strain[TREATMENT_COLNAME].unique()) == 2 or (
                len(meta_strain[TREATMENT_COLNAME].unique()) > 2
                and reject_corrected.loc[FEATURE, args.test]):
            # t-test to use
            t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum
            ttest_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(
                (t_test + '_' + strain))
            ttest_path = stats_dir / '{}_results.csv'.format(
                (t_test + '_' + strain))
            ttest_path.parent.mkdir(exist_ok=True, parents=True)

            # perform t-tests (without correction for multiple testing)
            stats_t, pvals_t, reject_t = univariate_tests(
                X=feat_strain,
                y=meta_strain[TREATMENT_COLNAME],
                control=CONTROL_TREATMENT,
                test=t_test,
                comparison_type='binary_each_group',
                multitest_correction=None,
                alpha=0.05)
            # get effect sizes for comparisons
            effect_sizes_t = get_effect_sizes(X=feat_strain,
                                              y=meta_strain[TREATMENT_COLNAME],
                                              control=CONTROL_TREATMENT,
                                              effect_type=None,
                                              linked_test=t_test)

            # compile + save t-test results (uncorrected)
            stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            effect_sizes_t.columns = [
                'effect_size_' + str(c) for c in effect_sizes_t.columns
            ]
            ttest_uncorrected = pd.concat(
                [stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_uncorrected.to_csv(ttest_path_uncorrected,
                                     header=True,
                                     index=True)

            # correct for multiple comparisons
            pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
            reject_t, pvals_t = _multitest_correct(
                pvals_t,
                multitest_method=args.fdr_method,
                fdr=args.pval_threshold)

            # compile + save t-test results (corrected)
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            ttest_corrected = pd.concat(
                [stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_corrected.to_csv(ttest_path, header=True, index=True)

            nsig = reject_t.loc[FEATURE].sum()
            print("%d %ss differ from %s in '%s' for %s (%s, P<%.2f, %s)" %
                  (nsig, TREATMENT_COLNAME, CONTROL_TREATMENT, FEATURE, strain,
                   t_test, args.pval_threshold, args.fdr_method))

    ### Pairwise t-tests comparing strain vs control behaviour on each antioxidant
    print("\nPerforming pairwise t-tests:")
    # subset for control data
    control_strain_meta = metadata[metadata[STRAIN_COLNAME] == CONTROL_STRAIN]
    control_strain_feat = features.reindex(control_strain_meta.index)
    control_df = control_strain_meta.join(control_strain_feat)

    for strain in strain_list:
        if strain == CONTROL_STRAIN:
            continue

        # subset for strain data
        strain_meta = metadata[metadata[STRAIN_COLNAME] == strain]
        strain_feat = features.reindex(strain_meta.index)
        strain_df = strain_meta.join(strain_feat)

        # perform pairwise t-tests comparing strain with control for each antioxidant treatment
        stats, pvals, reject = pairwise_ttest(control_df,
                                              strain_df,
                                              feature_list=[FEATURE],
                                              group_by=TREATMENT_COLNAME,
                                              fdr_method=args.fdr_method,
                                              fdr=args.pval_threshold)

        # compile table of results
        stats.columns = ['stats_' + str(c) for c in stats.columns]
        pvals.columns = ['pvals_' + str(c) for c in pvals.columns]
        reject.columns = ['reject_' + str(c) for c in reject.columns]
        test_results = pd.concat([stats, pvals, reject], axis=1)

        # save results
        ttest_strain_path = stats_dir / 'pairwise_ttests' / '{}_results.csv'.format(
            strain + "_vs_" + CONTROL_STRAIN)
        ttest_strain_path.parent.mkdir(parents=True, exist_ok=True)
        test_results.to_csv(ttest_strain_path, header=True, index=True)

        for antiox in antioxidant_list:
            print("%s difference in '%s' between %s vs %s on %s (paired t-test, P=%.3f, %s)" %\
                  (("SIGNIFICANT" if reject.loc[FEATURE, 'reject_{}'.format(antiox)] else "No"),
                  FEATURE, strain, CONTROL_STRAIN, antiox, pvals.loc[FEATURE, 'pvals_{}'.format(antiox)],
                  args.fdr_method))


def keio_stats(features, metadata, args):
    """ Perform statistical analyses on Keio screen results:
        - ANOVA tests for significant between strain variation among all strains for each feature
        - t-tests for significant differences between each strain and control for each feature
        - k-significant feature selection for agreement with ANOVA significant feature set
        
        Inputs
        ------
        features, metadata : pd.DataFrame
            Clean feature summaries and accompanying metadata
        
        args : Object
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - omit_strains : list
            - grouping_variable : str
            - control_dict : dict
            - collapse_control : bool
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int           
    """

    # categorical variable to investigate, e.g. 'gene_name'
    grouping_var = args.grouping_variable
    print("\nInvestigating '%s' variation" % grouping_var)

    # assert there will be no errors due to case-sensitivity
    assert len(metadata[grouping_var].unique()) == len(metadata[grouping_var].str.upper().unique())
    
    # Subset results (rows) to omit selected strains
    if args.omit_strains is not None:
        features, metadata = subset_results(features, metadata, grouping_var, args.omit_strains)

    # Load Tierpsy Top feature set + subset (columns) for top feats only
    if args.n_top_feats is not None:
        top_feats_path = Path(args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(str(args.n_top_feats))
        topfeats = load_topfeats(top_feats_path, add_bluelight=args.align_bluelight, 
                                 remove_path_curvature=True, header=None)
        
        # Drop features that are not in results
        top_feats_list = [feat for feat in list(topfeats) if feat in features.columns]
        features = features[top_feats_list]
    
    assert not features.isna().any().any()
    
    strain_list = list(metadata[grouping_var].unique())
    control = args.control_dict[grouping_var] # control strain to use
    assert control in strain_list
    
    if args.collapse_control:
        print("Collapsing control data (mean of each day)")
        features, metadata = average_plate_control_data(features, 
                                                        metadata, 
                                                        control=control, 
                                                        grouping_var=grouping_var, 
                                                        plate_var='imaging_plate_id')

    _ = df_summary_stats(metadata) # summary df # TODO: plot from?

    # Record mean sample size per group
    mean_sample_size = int(np.round(metadata.join(features).groupby([grouping_var], 
                                                                    as_index=False).size().mean()))
    print("Mean sample size: %d" % mean_sample_size)

    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / grouping_var / "Stats" / args.fdr_method
    plot_dir = save_dir / grouping_var / "Plots" / args.fdr_method

#%% F-test for equal variances

    # Compare within-group variance against the control group (correcting for multiple
    # comparisons). Unequal variances do not pose a problem for a t-test when sample sizes
    # are equal, so with balanced groups homogeneity of variances can be ignored. With
    # unbalanced groups, run F-tests (Levene's test) first to check whether variances are
    # equal before doing a t-test.
    if args.f_test:
        levene_stats_path = stats_dir / 'levene_results.csv'
        levene_stats = levene_f_test(features, metadata, grouping_var, 
                                      p_value_threshold=args.pval_threshold, 
                                      multitest_method=args.fdr_method,
                                      saveto=levene_stats_path,
                                      del_if_exists=False)
        # if p < 0.05 then variances are not equal, and sample size matters
        prop_eqvar = (levene_stats['pval'] > args.pval_threshold).sum() / len(levene_stats['pval'])
        print("Percentage equal variance %.1f%%" % (prop_eqvar * 100))
          
#%% ANOVA / Kruskal-Wallis tests for significantly different features across groups

    test_path_uncorrected = stats_dir / '{}_results_uncorrected.csv'.format(args.test)
    test_path = stats_dir / '{}_results.csv'.format(args.test)
    
    if not (test_path.exists() and test_path_uncorrected.exists()):
        test_path.parent.mkdir(exist_ok=True, parents=True)
    
        if (args.test == "ANOVA" or args.test == "Kruskal"):
            if len(strain_list) > 2:   
                # perform ANOVA + record results before & after correcting for multiple comparisons               
                stats, pvals, reject = univariate_tests(X=features, 
                                                        y=metadata[grouping_var], 
                                                        control=control, 
                                                        test=args.test,
                                                        comparison_type='multiclass',
                                                        multitest_correction=None, # uncorrected
                                                        alpha=args.pval_threshold,
                                                        n_permutation_test='all')

                # get effect sizes
                effect_sizes = get_effect_sizes(X=features, 
                                                y=metadata[grouping_var],
                                                control=control,
                                                effect_type=None,
                                                linked_test=args.test)

                # compile + save results (uncorrected)
                test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
                test_results.columns = ['stats','effect_size','pvals','reject']     
                test_results['significance'] = sig_asterix(test_results['pvals'])
                test_results = test_results.sort_values(by=['pvals'], ascending=True) # rank pvals
                test_results.to_csv(test_path_uncorrected, header=True, index=True)

                # correct for multiple comparisons
                reject_corrected, pvals_corrected = _multitest_correct(pvals, 
                                                                       multitest_method=args.fdr_method,
                                                                       fdr=args.pval_threshold)
                                            
                # compile + save results (corrected)
                test_results = pd.concat([stats, effect_sizes, pvals_corrected, reject_corrected], axis=1)
                test_results.columns = ['stats','effect_size','pvals','reject']     
                test_results['significance'] = sig_asterix(test_results['pvals'])
                test_results = test_results.sort_values(by=['pvals'], ascending=True) # rank pvals
                test_results.to_csv(test_path, header=True, index=True)
        
                # use reject mask to find significant feature set
                fset = pvals.loc[reject[args.test]].sort_values(by=args.test, ascending=True).index.to_list()
                #assert set(fset) == set(anova_corrected['pvals'].index[np.where(anova_corrected['pvals'] < 
                #args.pval_threshold)[0]])

                if len(fset) > 0:
                    print("%d significant features found by %s for '%s' (P<%.2f, %s)" % (len(fset), 
                          args.test, grouping_var, args.pval_threshold, args.fdr_method))
                    anova_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format(args.test)
                    write_list_to_file(fset, anova_sigfeats_path)
            else:
                fset = []
                print("\nWARNING: Not enough groups for %s for '%s' (n=%d groups)" %\
                      (args.test, grouping_var, len(strain_list)))
                    
#%% Linear Mixed Models (LMMs), accounting for day-to-day variation
        # NB: Ideally report:  parameter | beta | lower-95 | upper-95 | random effect (SD)
        elif args.test == 'LMM':
            with warnings.catch_warnings():
                # Filter warnings as parameter is often on the boundary
                warnings.filterwarnings("ignore")
                #warnings.simplefilter("ignore", ConvergenceWarning)
                (signif_effect, low_effect,  error, mask, pvals
                 ) = compounds_with_low_effect_univariate(feat=features, 
                                                          drug_name=metadata[grouping_var], 
                                                          drug_dose=None, 
                                                          random_effect=metadata[args.lmm_random_effect], 
                                                          control=control, 
                                                          test=args.test, 
                                                          comparison_type='multiclass',
                                                          multitest_method=args.fdr_method)
            assert len(error) == 0

            # save pvals
            pvals.to_csv(test_path_uncorrected, header=True, index=True)

            # save significant features -- if any strain significant for any feature
            fset = pvals.columns[(pvals < args.pval_threshold).any()].to_list()
            if len(fset) > 0:
                lmm_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format(args.test)
                write_list_to_file(fset, lmm_sigfeats_path)

            # report LMM results, and save significant effect strains (if any)
            print(("%d significant features found (%d significant %ss vs %s control, "
                   % (len(fset), len(signif_effect), grouping_var.replace('_',' '), control)
                   if len(signif_effect) > 0 else
                   "No significant differences found between %ss (" % grouping_var.replace('_',' '))
                  + "after accounting for %s variation, %s, P<%.2f, %s)"
                  % (args.lmm_random_effect.split('_yyyymmdd')[0], args.test,
                     args.pval_threshold, args.fdr_method))
            if len(signif_effect) > 0:
                signif_effect_path = stats_dir / '{}_signif_effect_strains.txt'.format(args.test)
                write_list_to_file(signif_effect, signif_effect_path)
        
        else:
            raise IOError("Test '{}' not recognised".format(args.test))
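# =============================================================================
#         # Sketch of the mixed model fitted per feature, assuming statsmodels
#         # ('speed_50th' is a hypothetical feature name; the project helper
#         # compounds_with_low_effect_univariate wraps this idea):
#         import statsmodels.formula.api as smf
#         feat = 'speed_50th'
#         df = metadata[[grouping_var, args.lmm_random_effect]].join(features[[feat]])
#         lmm = smf.mixedlm("%s ~ C(%s)" % (feat, grouping_var), data=df,
#                           groups=df[args.lmm_random_effect]).fit()
#         print(lmm.summary())  # betas with 95% CIs + random effect variance
# =============================================================================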
    
#%% t-tests / Mann-Whitney tests
    
    # t-test to use        
    t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney' # aka. Wilcoxon rank-sum      
    ttest_path_uncorrected = stats_dir / '{}_results_uncorrected.csv'.format(t_test)
    ttest_path = stats_dir / '{}_results.csv'.format(t_test)               

    if not (ttest_path_uncorrected.exists() and ttest_path.exists()):    
        ttest_path.parent.mkdir(exist_ok=True, parents=True)

        # NB: 'fset' is only defined if the ANOVA/LMM block above ran this session
        if len(fset) > 0 or len(strain_list) == 2:
            # perform t-tests (without correction for multiple testing)
            stats_t, pvals_t, reject_t = univariate_tests(X=features, 
                                                          y=metadata[grouping_var], 
                                                          control=control, 
                                                          test=t_test,
                                                          comparison_type='binary_each_group',
                                                          multitest_correction=None, 
                                                          alpha=0.05)
            # get effect sizes for comparisons
            effect_sizes_t =  get_effect_sizes(X=features, 
                                               y=metadata[grouping_var], 
                                               control=control,
                                               effect_type=None,
                                               linked_test=t_test)
            
            # compile + save t-test results (uncorrected)
            stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
            ttest_uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)
            
            # correct for multiple comparisons
            pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
            reject_t, pvals_t = _multitest_correct(pvals_t, 
                                                   multitest_method=args.fdr_method,
                                                   fdr=args.pval_threshold)

            # compile + save t-test results (corrected)
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            ttest_corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_corrected.to_csv(ttest_path, header=True, index=True)

            # record t-test significant features (not ordered): any strain significant vs control
            fset_ttest = pvals_t.index[reject_t.values.any(axis=1)].to_list()
            #assert set(fset_ttest) == set(pvals_t.index[(pvals_t < args.pval_threshold).sum(axis=1) > 0])
            print("%d significant features found for any %s vs %s (%s, P<%.2f)" %\
                  (len(fset_ttest), grouping_var, control, t_test, args.pval_threshold))

            if len(fset_ttest) > 0:
                ttest_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format(t_test)
                write_list_to_file(fset_ttest, ttest_sigfeats_path)
                                 
#%% K significant features
    
    ksig_uncorrected_path = stats_dir / 'k_significant_features_uncorrected.csv'
    ksig_corrected_path = stats_dir / 'k_significant_features.csv'
    if not (ksig_uncorrected_path.exists() and ksig_corrected_path.exists()):
        ksig_corrected_path.parent.mkdir(exist_ok=True, parents=True)      
        fset_ksig, (scores, pvalues_ksig), support = k_significant_feat(feat=features, 
                                                                        y_class=metadata[grouping_var], 
                                                                        k=len(fset),
                                                                        score_func='f_classif', 
                                                                        scale=None, 
                                                                        feat_names=None, 
                                                                        plot=False, 
                                                                        k_to_plot=None, 
                                                                        close_after_plotting=True,
                                                                        saveto=None, #k_sigfeat_dir
                                                                        figsize=None, 
                                                                        title=None, 
                                                                        xlabel=None)
        # compile + save k-significant features (uncorrected) 
        ksig_table = pd.concat([pd.Series(scores), pd.Series(pvalues_ksig)], axis=1)
        ksig_table.columns = ['scores','pvals']
        ksig_table.index = fset_ksig
        ksig_table.to_csv(ksig_uncorrected_path, header=True, index=True)   
        
        # Correct for multiple comparisons
        _, ksig_table['pvals'] = _multitest_correct(ksig_table['pvals'], 
                                                    multitest_method=args.fdr_method,
                                                    fdr=args.pval_threshold)
        
        # save k-significant features (corrected)
        ksig_table.to_csv(ksig_corrected_path, header=True, index=True)   
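# =============================================================================
#     # k_significant_feat with score_func='f_classif' is akin to scikit-learn's
#     # SelectKBest (illustrative sketch only; assumes len(fset) >= 1):
#     from sklearn.feature_selection import SelectKBest, f_classif
#     skb = SelectKBest(f_classif, k=len(fset)).fit(features, metadata[grouping_var])
#     ksig_feats = features.columns[skb.get_support()].to_list()
# =============================================================================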

#%% mRMR feature selection: minimum Redundancy, Maximum Relevance #####
    
    mrmr_dir = plot_dir / 'mrmr'
    mrmr_dir.mkdir(exist_ok=True, parents=True)
    mrmr_results_path = mrmr_dir / "mrmr_results.csv"

    if not mrmr_results_path.exists():
        estimator = Pipeline([('scaler', StandardScaler()), ('estimator', LogisticRegression())])
        y = metadata[grouping_var].values
        (mrmr_feat_set, 
         mrmr_scores, 
         mrmr_support) = mRMR_feature_selection(features, y_class=y, k=10,
                                                redundancy_func='pearson_corr',
                                                relevance_func='kruskal',
                                                n_bins=10, mrmr_criterion='MID',
                                                plot=True, k_to_plot=5, 
                                                close_after_plotting=True,
                                                saveto=mrmr_dir, figsize=None)
        # save results                                        
        mrmr_table = pd.concat([pd.Series(mrmr_feat_set), pd.Series(mrmr_scores)], axis=1)
        mrmr_table.columns = ['feature','score']
        mrmr_table.to_csv(mrmr_results_path, header=True, index=False)
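        # Evaluate how well the mRMR-selected features separate the groups:
        # 5-fold cross-validation of a scaled logistic regression. (With an
        # integer cv and a classifier, sklearn's cross_val_score defaults to
        # stratified folds and accuracy scoring.)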
        
        n_cv = 5
        cv_scores_mrmr = cross_val_score(estimator, features[mrmr_feat_set], y, cv=n_cv)
        cv_scores_mrmr = pd.DataFrame(cv_scores_mrmr, columns=['cv_score'])
        cv_scores_mrmr.to_csv(mrmr_dir / "cv_scores.csv", header=True, index=False)        
        print('MRMR CV Score: %f (n=%d)' % (cv_scores_mrmr['cv_score'].mean(), n_cv))
    else:
        # load mrmr results
        mrmr_table = pd.read_csv(mrmr_results_path)
        
    mrmr_feat_set = mrmr_table['feature'].to_list()
    print("\nTop %d features found by MRMR:" % len(mrmr_feat_set))
    for feat in mrmr_feat_set:
        print(feat)


def control_variation(feat,
                      meta, 
                      args,
                      variables=['date_yyyymmdd','instrument_name','imaging_run_number'],
                      n_sig_features=None):
    """ Analyse variation in control data with respect to each categorical variable in 'variables'
        
        Inputs
        ------
        feat, meta : pd.DataFrame
            Matching features summaries and metadata for control data
        
        args : Object 
            Python object with the following attributes:
            - remove_outliers : bool
            - grouping_variable : str
            - control_dict : dict
            - test : str
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int
            - n_top_feats : int
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            
        variables : list
            List of categorical random variables to analyse variation in control data
    """
    
    assert set(feat.index) == set(meta.index)
            
    save_dir = get_save_dir(args) / "control"

    # Stats test to use
    assert args.test in ['ANOVA','Kruskal','LMM']
    t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney' # a.k.a. Wilcoxon rank-sum test
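    # NOTE: when args.test is parametric (ANOVA) the posthoc comparisons use
    # t-tests; otherwise (Kruskal, and also LMM as currently written) pairwise
    # Mann-Whitney U tests are used for each group vs control.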
    
    for grouping_var in tqdm(variables):
        
        # convert grouping variable column to factor (categorical)
        meta[grouping_var] = meta[grouping_var].astype(str)
                
        # get control group, e.g. for date_yyyymmdd
        control_group = str(args.control_dict[grouping_var])
        print("\nInvestigating variation in '%s' (control: '%s')" % (grouping_var, control_group))

        # Record mean sample size per group
        mean_sample_size = int(np.round(meta.groupby([grouping_var]).size().mean()))
        print("Mean sample size: %d" % mean_sample_size)
        
        group_list = list(meta[grouping_var].unique())
        stats_dir =  save_dir / "Stats" / grouping_var
        plot_dir = save_dir / "Plots" / grouping_var

        ##### STATISTICS #####
                      
        stats_path = stats_dir / '{}_results.csv'.format(args.test) # LMM/ANOVA/Kruskal  
        ttest_path = stats_dir / '{}_results.csv'.format(t_test)
    
        if not (stats_path.exists() and ttest_path.exists()):
            stats_path.parent.mkdir(exist_ok=True, parents=True)
            ttest_path.parent.mkdir(exist_ok=True, parents=True)
        
            ### ANOVA / Kruskal-Wallis tests for significantly different features across groups
            fset = []  # ensure 'fset' is defined even when no global test runs (e.g. args.test == 'LMM')
            if args.test in ("ANOVA", "Kruskal"):
                if len(group_list) > 2:
                    stats, pvals, reject = univariate_tests(X=feat, 
                                                            y=meta[grouping_var], 
                                                            control=control_group, 
                                                            test=args.test,
                                                            comparison_type='multiclass',
                                                            multitest_correction=args.fdr_method, 
                                                            alpha=0.05)
                    # get effect sizes
                    effect_sizes = get_effect_sizes(X=feat, 
                                                    y=meta[grouping_var],
                                                    control=control_group,
                                                    effect_type=None,
                                                    linked_test=args.test)
                                                
                    anova_table = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
                    anova_table.columns = ['stats','effect_size','pvals','reject']     

                    anova_table['significance'] = sig_asterix(anova_table['pvals'])
        
                    # Sort pvals + record significant features
                    anova_table = anova_table.sort_values(by=['pvals'], ascending=True)
                    fset = anova_table.index[anova_table['pvals'] < args.pval_threshold].to_list()
                    
                    # Save statistics results + significant feature set to file
                    anova_table.to_csv(stats_path, header=True, index=True)
            
                    if len(fset) > 0:
                        anova_sigfeats_path = Path(str(stats_path).replace('_results.csv', '_sigfeats.txt'))
                        write_list_to_file(fset, anova_sigfeats_path)
                        print("\n%d significant features found by %s for '%s' (P<%.2f, %s)" %\
                              (len(fset), args.test, grouping_var, args.pval_threshold, args.fdr_method))
                else:
                    fset = []
                    print("\nWARNING: Not enough groups for %s for '%s' (n=%d groups)" %\
                          (args.test, grouping_var, len(group_list)))
            
            ### t-tests / Mann-Whitney tests
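            # 'binary_each_group' tests each group against the control group
            # separately, so stats/pvals/reject come back with one column per
            # non-control group. Posthoc tests only run if the global test found
            # significant features, or if there are exactly two groups (in which
            # case no global test is needed).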
            if len(fset) > 0 or len(group_list) == 2:
                stats_t, pvals_t, reject_t = univariate_tests(X=feat, 
                                                              y=meta[grouping_var], 
                                                              control=control_group, 
                                                              test=t_test,
                                                              comparison_type='binary_each_group',
                                                              multitest_correction=args.fdr_method, 
                                                              alpha=0.05)
                effect_sizes_t =  get_effect_sizes(X=feat, y=meta[grouping_var], 
                                                   control=control_group,
                                                   effect_type=None,
                                                   linked_test=t_test)
                
                stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
                pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
                reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
                effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
                
                ttest_table = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)

                # Record t-test significant feature set (NOT ORDERED)
                fset_ttest = list(pvals_t.index[(pvals_t < args.pval_threshold).sum(axis=1) > 0])
                
                # Save t-test results to file
                ttest_table.to_csv(ttest_path, header=True, index=True) # Save test results to CSV

                if len(fset_ttest) > 0:
                    ttest_sigfeats_path = Path(str(ttest_path).replace('_results.csv', '_sigfeats.txt'))
                    write_list_to_file(fset_ttest, ttest_sigfeats_path)
                    print("%d signficant features found for any %s vs %s (%s, P<%.2f)" %\
                          (len(fset_ttest), grouping_var, control_group, t_test, args.pval_threshold))
                
                # Barplot of number of significantly different features for each strain   
                barplot_sigfeats(test_pvalues_df=pvals_t, 
                                 saveDir=plot_dir,
                                 p_value_threshold=args.pval_threshold,
                                 test_name=t_test)
                                 
        ### Load statistics results
        
        # Read ANOVA results and record significant features
        print("\nLoading statistics results")
        if len(group_list) > 2:
            anova_table = pd.read_csv(stats_path, index_col=0)
            pvals = anova_table.sort_values(by='pvals', ascending=True)['pvals']
            fset = pvals[pvals < args.pval_threshold].index.to_list()
            print("%d significant features found by %s (P<%.2f)" % (len(fset), args.test, 
                                                                    args.pval_threshold))
        
        # Read t-test results and record significant features (NOT ORDERED)
        ttest_table = pd.read_csv(ttest_path, index_col=0)
        pvals_t = ttest_table[[c for c in ttest_table if "pvals_" in c]]             
        fset_ttest = pvals_t[(pvals_t < args.pval_threshold).sum(axis=1) > 0].index.to_list()
        print("%d significant features found by %s (P<%.2f)" % (len(fset_ttest), t_test, args.pval_threshold))
            
        # Use t-test significant feature set if comparing just 2 strains
        if len(group_list) == 2:
            fset = fset_ttest
                       
        if n_sig_features is None:
            if args.n_sig_features is not None:
                n_sig_features = args.n_sig_features 
            else:
                n_sig_features = len(fset)
                                   
        ##### Plotting #####
        
        superplot_dir = plot_dir / 'superplots' 
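        # Each significant feature is plotted once per nuisance variable
        # (date, instrument, imaging run), paired with the current grouping
        # variable, to help eyeball which source of variation dominates.
        # P-values are annotated only when the plotted variable is the one
        # under test.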

        if len(fset) > 1:        
            for feature in tqdm(fset[:n_sig_features]):                
                # plot variation in variable with respect to 'date_yyyymmdd'
                superplot(feat, meta, feature, 
                          x1=grouping_var, 
                          x2=None if grouping_var == 'date_yyyymmdd' else 'date_yyyymmdd',
                          saveDir=superplot_dir,
                          pvals=pvals_t if grouping_var == 'date_yyyymmdd' else None,
                          pval_threshold=args.pval_threshold,
                          show_points=True, 
                          plot_means=True,
                          dodge=True)
                # plot variation in variable with respect to 'instrument_name'
                superplot(feat, meta, feature, 
                          x1=grouping_var, 
                          x2=None if grouping_var == 'instrument_name' else 'instrument_name',
                          saveDir=superplot_dir,
                          pvals=pvals_t if grouping_var == 'instrument_name' else None,
                          pval_threshold=args.pval_threshold,
                          show_points=True, 
                          plot_means=True,
                          dodge=True)
                # plot variation in variable with respect to 'imaging_run_number'
                superplot(feat, meta, feature, 
                          x1=grouping_var, 
                          x2=None if grouping_var == 'imaging_run_number' else 'imaging_run_number',
                          saveDir=superplot_dir,
                          pvals=pvals_t if grouping_var == 'imaging_run_number' else None,
                          pval_threshold=args.pval_threshold,
                          show_points=True, 
                          plot_means=True,
                          dodge=True)
                            
            # # Boxplots of significant features by ANOVA/LMM (all groups)
            # boxplots_grouped(feat_meta_df=meta.join(feat), 
            #                  group_by=grouping_var,
            #                  control_group=str(control_group),
            #                  test_pvalues_df=pvals_t.T, # ranked by test pvalue significance
            #                  feature_set=fset,
            #                  saveDir=(plot_dir / 'grouped_boxplots'),
            #                  max_feats2plt=args.n_sig_features, 
            #                  max_groups_plot_cap=None,
            #                  p_value_threshold=args.pval_threshold,
            #                  drop_insignificant=False,
            #                  sns_colour_palette="tab10",
            #                  figsize=[6, (len(group_list)/3 if len(group_list)>10 else 12)])
                    
            # Individual boxplots of significant features by pairwise t-test (each group vs control)
            # boxplots_sigfeats(feat_meta_df=meta.join(feat), 
            #                   test_pvalues_df=pvals_t, 
            #                   group_by=grouping_var, 
            #                   control_strain=control_group, 
            #                   feature_set=fset, #['speed_norm_50th_bluelight'],
            #                   saveDir=plot_dir / 'paired_boxplots',
            #                   max_feats2plt=args.n_sig_features,
            #                   p_value_threshold=args.pval_threshold,
            #                   drop_insignificant=True,
            #                   verbose=False)
                
            # from tierpsytools.analysis.significant_features import plot_feature_boxplots
            # plot_feature_boxplots(feat_to_plot=fset,
            #                       y_class=grouping_var,
            #                       scores=pvalues.rank(axis=1),
            #                       feat=feat,
            #                       pvalues=np.asarray(pvalues).flatten(),
            #                       saveto=None,
            #                       close_after_plotting=False)
        
        ##### Hierarchical Clustering Analysis #####
        print("\nPerforming hierarchical clustering analysis...")

        assert not feat.isna().sum(axis=1).any()
        assert not (feat.std(axis=0) == 0).any()
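        # zscore standardises each feature column to zero mean and unit variance;
        # the asserts above guard against NaNs and zero-variance features, which
        # would otherwise yield NaN columns after normalisation.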
        
        # Z-normalise data
        featZ = feat.apply(zscore, axis=0)
        #featZ = (feat-feat.mean())/feat.std() # minus mean, divide by std
        
        #from tierpsytools.preprocessing.scaling_class import scalingClass
        #scaler = scalingClass(scaling='standardize')
        #featZ = scaler.fit_transform(feat)

    
        ### Control clustermap
        
        # control data is clustered and feature order is stored and applied to full data
        if len(group_list) > 1 and len(group_list) < 50 and grouping_var != 'date_yyyymmdd':
            control_clustermap_path = plot_dir / 'heatmaps' / (grouping_var + '_date_clustermap.pdf')
            cg = plot_clustermap(featZ, meta,
                                 group_by=[grouping_var, 'date_yyyymmdd'],
                                 col_linkage=None,
                                 method=METHOD, # linkage method, e.g. 'single', 'complete', 'average', 'weighted', 'centroid'
                                 metric=METRIC,
                                 figsize=[15,8],
                                 sub_adj={'bottom':0.02,'left':0.02,'top':1,'right':0.85},
                                 label_size=12,
                                 show_xlabels=False,
                                 saveto=control_clustermap_path)
    
            #col_linkage = cg.dendrogram_col.calculated_linkage
            clustered_features = np.array(featZ.columns)[cg.dendrogram_col.reordered_ind]
        else:
            clustered_features = None
                    
        ## Save z-normalised values
        # z_stats = featZ.join(meta[grouping_var]).groupby(by=grouping_var).mean().T
        # z_stats.columns = ['z-mean_' + v for v in z_stats.columns.to_list()]
        # z_stats.to_csv(z_stats_path, header=True, index=None)
        
        # Clustermap of full data       
        full_clustermap_path = plot_dir / 'heatmaps' / (grouping_var + '_clustermap.pdf')
        fg = plot_clustermap(featZ, meta, 
                             group_by=grouping_var,
                             col_linkage=None,
                             method=METHOD,
                             metric=METRIC,
                             figsize=[15,8],
                             sub_adj={'bottom':0.02,'left':0.02,'top':1,'right':0.9},
                             label_size=12,
                             saveto=full_clustermap_path)
        
        # If no control clustering (due to no day variation) then use clustered features for all 
        # strains to order barcode heatmaps
        if clustered_features is None:
            clustered_features = np.array(featZ.columns)[fg.dendrogram_col.reordered_ind]
        
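        # Align p-values to the clustered feature order (ANOVA p-values if >2
        # groups, else the single t-test column) for annotating the barcode
        # heatmaps below.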
        if len(group_list) > 2:
            pvals_heatmap = anova_table.loc[clustered_features, 'pvals']
        elif len(group_list) == 2:
            pvals_heatmap = pvals_t.loc[clustered_features, pvals_t.columns[0]]
        pvals_heatmap.name = 'P < {}'.format(args.pval_threshold)
    
        assert all(f in featZ.columns for f in pvals_heatmap.index)
                
        # Plot barcode heatmap (grouping by date)
        if len(group_list) > 1 and len(group_list) < 50 and grouping_var != 'date_yyyymmdd':
            heatmap_date_path = plot_dir / 'heatmaps' / (grouping_var + '_date_heatmap.pdf')
            plot_barcode_heatmap(featZ=featZ[clustered_features], 
                                 meta=meta, 
                                 group_by=[grouping_var, 'date_yyyymmdd'],
                                 pvalues_series=pvals_heatmap,
                                 p_value_threshold=args.pval_threshold,
                                 selected_feats=fset if len(fset) > 0 else None,
                                 saveto=heatmap_date_path,
                                 figsize=[20,7],
                                 sns_colour_palette="Pastel1")
        
        # Plot group-mean heatmap (averaged across days)
        heatmap_path = plot_dir / 'heatmaps' / (grouping_var + '_heatmap.pdf')
        plot_barcode_heatmap(featZ=featZ[clustered_features], 
                             meta=meta, 
                             group_by=[grouping_var], 
                             pvalues_series=pvals_heatmap,
                             p_value_threshold=args.pval_threshold,
                             selected_feats=fset if len(fset) > 0 else None,
                             saveto=heatmap_path,
                             figsize=[20, (int(len(group_list) / 4) if len(group_list) > 10 else 6)],
                             sns_colour_palette="Pastel1")        
                        
        ##### Principal Components Analysis #####
        print("Performing principal components analysis")
    
        if args.remove_outliers:
            outlier_path = plot_dir / 'mahalanobis_outliers.pdf'
            feat, inds = remove_outliers_pca(df=feat, 
                                             features_to_analyse=None, 
                                             saveto=outlier_path)
            meta = meta.reindex(feat.index) # reindex metadata
            featZ = feat.apply(zscore, axis=0) # re-normalise data

            # Drop features with NaN values after normalising
            n_cols = len(featZ.columns)
            featZ.dropna(axis=1, inplace=True)
            n_dropped = n_cols - len(featZ.columns)
            if n_dropped > 0:
                print("Dropped %d features after normalisation (NaN)" % n_dropped)

        #from tierpsytools.analysis.decomposition import plot_pca
        pca_dir = plot_dir / 'PCA'
        _ = plot_pca(featZ, meta, 
                     group_by=grouping_var, 
                     control=control_group,
                     var_subset=None, 
                     saveDir=pca_dir,
                     PCs_to_keep=10,
                     n_feats2print=10,
                     sns_colour_palette="plasma",
                     n_dims=2,
                     label_size=15,
                     figsize=[9,8],
                     sub_adj={'bottom':0.13,'left':0.12,'top':0.98,'right':0.98},
                     # legend_loc='upper right',
                     # n_colours=20,
                     hypercolor=False)