Esempio n. 1
0
    def test_run_group_significance_test(self):
        """Test that all group significance tests can be run.

        Each section builds a row generator from a biom table and a mapping
        of group name -> sample indices, calls run_group_significance_test
        with a test name and GROUP_TEST_CHOICES, and compares the returned
        test statistics, p-values and group means against hard-coded
        expected values.

        The permutation/bootstrap based sections call seed(0) immediately
        before running so that the hard-coded expectations are reproducible;
        do not reorder those calls.
        """
        # Two input tables are exercised throughout: BT_IN_1 and BT_4.
        bt = parse_biom_table(BT_IN_1)
        bt_4 = parse_biom_table(BT_4)

        # test with nonparametric t-test
        sample_indices = {'cat1': [0, 5, 1], 'cat2': [2, 4, 3]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [0.17503798979747345, 0.20029818620053824,
            -1.5065313062753816, -0.043884559904114794, -1.0631239617935129,
            -1.2878361428003895]
        # expected p-values are counts out of 1001 comparisons (the test is
        # run with reps=1000; presumably the extra comparison is the observed
        # statistic itself -- TODO confirm against the test implementation)
        exp_pvals = map(lambda x: x/1001., [888, 899, 279, 1001, 489, 299])
        exp_means = [[52.333333333333336, 48.333333333333336],
            [34.0, 30.333333333333332],
            [20.0, 49.333333333333336],
            [55.333333333333336, 56.0],
            [20.0, 38.0],
            [30.0, 60.333333333333336]]
        seed(0) # seed prng for reproducibility
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'nonparametric_t_test', 
                GROUP_TEST_CHOICES, reps=1000)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with nonparametric t-test but different ordering of the sample
        # indices within each group; the expected values above are reused
        # unchanged
        sample_indices = {'cat1': [0, 1, 5], 'cat2': [4, 3, 2]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        seed(0) # seed prng for reproducibility
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'nonparametric_t_test', 
                GROUP_TEST_CHOICES, reps=1000)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with BT_4 biom table
        sample_indices = {'cat1': [0,3,1,4], 'cat2': [5,2,7,6]}
        row_gen = group_significance_row_generator(bt_4, sample_indices)
        exp_test_stats = [-0.38741397129147953, -0.38334158591463874,
            0.077468274988510541, -0.2322539745918096, 0.16469600468808282,
            -0.49589486133213057]
        # expected p-values are again expressed as counts out of 1001
        # comparisons
        exp_pvals = map(lambda x: x/1001., [821,719,916,935,938,604])
        exp_means = [[43.5, 51.75],
            [29.75, 34.75],
            [41.5, 40.0],
            [50.5, 53.75],
            [28.0, 25.5],
            [41.75, 54.0]]
        seed(0) # seed prng for reproducibility
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'nonparametric_t_test', 
                GROUP_TEST_CHOICES, reps=1000)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)

        # test with parametric t test (no reps argument and no seeding here)
        # bt_1 agrees with Prism
        sample_indices = {'cat1': [4, 1, 2], 'cat2': [5, 0, 3]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [-1.0504514628777806, -0.94113003446934629,
            -0.66264262463016887, 0.17617555832772411,  1.1144416530351877,
            -1.2483315640812607]
        exp_pvals = [0.3527834167236007, 0.39992473225679626, 
            0.5437923932346147, 0.8687158192049661, 0.32753202812350557,
            0.27998887149482976]
        exp_means = [[39.666666666666664, 61.0],
            [24.333333333333332, 40.0],
            [27.0, 42.333333333333336],
            [57.0, 54.333333333333336],
            [38.333333333333336, 19.666666666666668],
            [30.333333333333332, 60.0]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'parametric_t_test', 
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with BT_4
        sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
        row_gen = group_significance_row_generator(bt_4, sample_indices)
        exp_test_stats = [0.43577690622483684, -2.5911938781738648,
            -1.3573515147239095, 1.2101173913086851, 2.137178815882979,
             0.0099191576638653078]
        exp_pvals = [0.67823972846362579, 0.041145883121579255,
            0.2235024418313547, 0.27174025956151748, 0.076447615888438444,
             0.9924073718332862]
        exp_means = [[52.25, 43.0],
             [20.5, 44.0],
             [29.25, 52.25],
             [59.75, 44.5],
             [39.0, 14.5],
             [48.0, 47.75]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'parametric_t_test', 
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)

        # test with bootstrapped mann_whitney_u
        sample_indices = {'cat1': [4, 1, 2], 'cat2': [5, 0, 3]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [7.0, 7.0, 7.0, 6.0, 7.0, 7.0]
        exp_pvals = [0.333, 0.305, 0.3, 0.623, 0.295, 0.334]
        exp_means = [[39.666666666666664, 61.0],
            [24.333333333333332, 40.0],
            [27.0, 42.333333333333336],
            [57.0, 54.333333333333336],
            [38.333333333333336, 19.666666666666668],
            [30.333333333333332, 60.0]]
        seed(0) # seed prng for reproducibility
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'bootstrap_mann_whitney_u', 
                GROUP_TEST_CHOICES, reps=1000)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with BT_4
        sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
        row_gen = group_significance_row_generator(bt_4, sample_indices)  
        exp_test_stats = [10.0, 15.0, 11.0, 14.0, 15.0, 9.0]
        exp_pvals = [0.605, 0.033, 0.414, 0.097, 0.041, 0.814]
        exp_means = [[52.25, 43.0],
             [20.5, 44.0],
             [29.25, 52.25],
             [59.75, 44.5],
             [39.0, 14.5],
             [48.0, 47.75]]
        seed(0) # seed prng for reproducibility
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'bootstrap_mann_whitney_u', 
                GROUP_TEST_CHOICES, reps=1000)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)

        # test with parametric mann whitney u
        sample_indices = {'cat1': [0, 3, 1], 'cat2': [4, 2, 5]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [6.0, 6.0, 5.0, 5.0, 6.0, 5.0]
        exp_pvals = [0.51269076026192328, 0.51269076026192328,
            0.82725934656271127, 0.82725934656271127, 0.51269076026192328,
            0.82725934656271127]
        exp_means = [[52.666666666666664, 48.0],
            [23.666666666666668, 40.666666666666664],
            [34.0, 35.333333333333336],
            [56.333333333333336, 55.0],
            [32.333333333333336, 25.666666666666668],
            [46.0, 44.333333333333336]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'mann_whitney_u',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with BT_4
        sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
        row_gen = group_significance_row_generator(bt_4, sample_indices)
        exp_test_stats = [10.0, 15.0, 11.0, 14.0, 15.0, 9.0]
        exp_pvals = [0.5637028616507731, 0.043308142810791955,
            0.38363032713198975, 0.083264516663550406, 0.043308142810791955,
            0.77282999268444752]
        exp_means = [[52.25, 43.0],
             [20.5, 44.0],
             [29.25, 52.25],
             [59.75, 44.5],
             [39.0, 14.5],
             [48.0, 47.75]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'mann_whitney_u',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)

        # test with ANOVA (three groups of two samples each)
        sample_indices = {'cat1': [0, 3], 'cat2': [4, 5], 'cat3': [2, 1]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [0.022340083574413375, 20.028268551236753,
            2.086854460093897, 0.96500593119810185, 4.8390804597701154,
            0.54346882684796749]
        exp_pvals = [0.97806870848824634, 0.018391757629969238,
            0.27043709109167957, 0.47468983920325486, 0.11510587547067222,
            0.62890473306440042]
        exp_means = [[53.0, 46.5, 51.5],
            [28.5, 55.5, 12.5],
            [50.0, 45.5, 8.5],
            [50.5, 47.5, 69.0],
            [28.0, 9.0, 50.0],
            [65.0, 39.5, 31.0]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'ANOVA',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with uneven group sizes
        sample_indices = {'cat1': [0, 2 ,3, 1], 'cat2': [4, 5]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [0.05663963168179019, 16.436058700209646,
            0.43828937472444823, 0.675244322576109, 4.7713717693836974,
            0.083541102077687446]
        exp_pvals = [0.8235822412182755, 0.015422975290359022,
            0.54414414026513325, 0.45738578176242134, 0.094285405564661875,
            0.78691584834507211]
        exp_means = [[52.25, 46.5],
            [20.5, 55.5],
            [29.25, 45.5],
            [59.75, 47.5],
            [39.0, 9.0],
            [48.0, 39.5]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'ANOVA',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with bt_4
        sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
        row_gen = group_significance_row_generator(bt_4, sample_indices)
        exp_test_stats = [0.18990151199889027, 6.7142857142857144,
             1.8424031345232912, 1.4643841007477372, 4.5675332910589734,
             9.8389688760617899e-05]
        exp_pvals = [0.6782397284636259, 0.041145883121579234,
             0.22350244183135481, 0.27174025956151771, 0.076447615888438403,
             0.9924073718332751]
        exp_means = [[52.25, 43.0],
             [20.5, 44.0],
             [29.25, 52.25],
             [59.75, 44.5],
             [39.0, 14.5],
             [48.0, 47.75]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'ANOVA',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)

        # test with g goodness of fit
        sample_indices = {'cat1': [0, 3], 'cat2': [4, 5], 'cat3': [2, 1]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [0.46328913071721711,
            29.810689447160001, 37.234612591840595, 4.7031232724401875,
            31.207185565457102, 13.332324853339509]
        exp_pvals = [0.79322801392154108,
            3.3627225458535774e-07, 8.2149818410655555e-09, 0.09522034650579822,
            1.6728066897036456e-07, 0.00127327567601971]
        exp_means = [[53.0, 46.5, 51.5],
            [28.5, 55.5, 12.5],
            [50.0, 45.5, 8.5],
            [50.5, 47.5, 69.0],
            [28.0, 9.0, 50.0],
            [65.0, 39.5, 31.0]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'g_test',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with individual groups (one sample per group)
        sample_indices = {'cat1': [0], 'cat2': [1], 'cat3': [3], 
            'cat4': [2], 'cat5': [5], 'cat6': [4]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [68.7536611489639, 62.908926545455522,
            115.84654226008865, 26.819713749563704, 84.940231595557307,
            105.37909384565077]
        exp_pvals = [1.8616725644907271e-13, 3.0403858229558975e-12,
            2.3772983815049693e-23, 6.1843461955812955e-05,
            7.7481603433718027e-17, 3.8768150325829967e-21]
        exp_means = [[28.0, 52.0, 78.0, 51.0, 77.0, 16.0],
            [25.0, 14.0, 32.0, 11.0, 63.0, 48.0],
            [31.0, 2.0, 69.0, 15.0, 27.0, 64.0],
            [36.0, 68.0, 65.0, 70.0, 62.0, 33.0],
            [16.0, 41.0, 40.0, 59.0, 3.0, 15.0],
            [32.0, 8.0, 98.0, 54.0, 50.0, 29.0]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'g_test',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with uneven length groups
        sample_indices = {'cat1': [0, 3, 4, 5], 'cat3': [2, 1]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [0.030099778845406742,
            16.703388149486191, 29.941854048163027, 3.39187772427496,
            14.935738277477988, 5.4519230964604013]
        exp_pvals = [0.86226402523867973, 4.3702877865113464e-05,
            4.451983032513133e-08, 0.065518295867083964, 0.00011123571448583719,
            0.019546798231055287]
        exp_means = [[49.75, 51.5],
            [42.0, 12.5],
            [47.75, 8.5],
            [49.0, 69.0],
            [18.5, 50.0],
            [52.25, 31.0]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'g_test',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with bt_4
        sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
        row_gen = group_significance_row_generator(bt_4, sample_indices)
        exp_test_stats = [0.8950130401309585, 8.6948783805472942,
             6.5397009199496443, 2.2281537448054953, 11.541070115516771,
             0.00064935138712822981]
        exp_pvals = [0.34412242732851783, 0.0031910540870178925,
             0.010549308294222293, 0.13551569348660794, 0.00068075444949030543,
             0.97967020739471489]
        exp_means = [[52.25, 43.0],
             [20.5, 44.0],
             [29.25, 52.25],
             [59.75, 44.5],
             [39.0, 14.5],
             [48.0, 47.75]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'g_test',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)

        # test with Kruskal Wallis
        sample_indices = {'cat1': [0, 3], 'cat2': [4, 5], 'cat3': [2, 1]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [0.2857142857142847,
            4.5714285714285694, 3.7142857142857117, 3.7142857142857117,
            4.5714285714285694, 0.85714285714285765]
        exp_pvals = [0.86687789975018215, 0.10170139230422694,
            0.15611804531597129, 0.15611804531597129, 0.10170139230422694,
            0.65143905753105535]
        exp_means = [[53.0, 46.5, 51.5],
            [28.5, 55.5, 12.5],
            [50.0, 45.5, 8.5],
            [50.5, 47.5, 69.0],
            [28.0, 9.0, 50.0],
            [65.0, 39.5, 31.0]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'kruskal_wallis',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with individual groups (one sample per group)
        sample_indices = {'cat1': [0], 'cat2': [1], 'cat3': [3], 
            'cat4': [2], 'cat5': [5], 'cat6': [4]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [5.0, 5.0, 5.0, 5.0, 5.0, 5.0]
        exp_pvals = [0.41588018699550794, 0.41588018699550794,
            0.41588018699550794, 0.41588018699550794, 0.41588018699550794,
            0.41588018699550794]
        exp_means = [[28.0, 52.0, 78.0, 51.0, 77.0, 16.0],
            [25.0, 14.0, 32.0, 11.0, 63.0, 48.0],
            [31.0, 2.0, 69.0, 15.0, 27.0, 64.0],
            [36.0, 68.0, 65.0, 70.0, 62.0, 33.0],
            [16.0, 41.0, 40.0, 59.0, 3.0, 15.0],
            [32.0, 8.0, 98.0, 54.0, 50.0, 29.0]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'kruskal_wallis',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with uneven length groups
        sample_indices = {'cat1': [0, 3, 4, 5], 'cat3': [2, 1]}
        row_gen = group_significance_row_generator(bt, sample_indices)
        exp_test_stats = [0.0, 3.428571428571427, 3.428571428571427,
            3.428571428571427, 3.428571428571427, 0.21428571428571175]
        exp_pvals = [1, 0.064077506451059238, 0.064077506451059238,
            0.064077506451059238, 0.064077506451059238, 0.64342884356362262]
        exp_means = [[49.75, 51.5],
            [42.0, 12.5],
            [47.75, 8.5],
            [49.0, 69.0],
            [18.5, 50.0],
            [52.25, 31.0]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'kruskal_wallis',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
        # test with bt_4
        sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
        row_gen = group_significance_row_generator(bt_4, sample_indices)
        exp_test_stats = [0.33333333333333215, 4.0833333333333321, 
            0.75903614457831325, 3.0, 4.0833333333333321, 0.083333333333332149]
        exp_pvals = [0.56370286165077377, 0.043308142810792101,
            0.38363032713198986, 0.08326451666355042, 0.043308142810792101,
            0.77282999268444919]
        exp_means = [[52.25, 43.0],
             [20.5, 44.0],
             [29.25, 52.25],
             [59.75, 44.5],
             [39.0, 14.5],
             [48.0, 47.75]]
        obs_test_stats, obs_pvals, obs_means = \
            run_group_significance_test(row_gen, 'kruskal_wallis',
                GROUP_TEST_CHOICES)
        self.assertFloatEqual(exp_test_stats, obs_test_stats)
        self.assertFloatEqual(exp_pvals, obs_pvals)
        self.assertFloatEqual(exp_means, obs_means)
def main():
    """Run the group significance script from parsed command-line options.

    Steps, in order:
      1. Load the biom table and mapping file and sync them with
         sync_biom_and_mf.
      2. Group the samples by ``opts.category`` and look up their indices in
         the synced table.
      3. Validate that the requested test (``opts.test``) can be applied.
      4. Run the test, compute FDR and Bonferroni corrected p-values (capped
         at 1.0), and write the formatted, sorted lines to ``opts.output_fp``.

    Raises:
        ValueError: if any biom-table sample id is among the non-shared
            samples and --biom_samples_are_superset was not passed; if a
            metadata group has no samples; if 'mann_whitney_u' is requested
            with 20 or fewer samples in total; or if 'g_test' is requested
            on a table whose sums look like relative abundances.

    ``option_parser.error`` is called (presumably terminating the script,
    as optparse does) when a two-group test is requested with more than two
    groups.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any(i in nonshared_samples for i in tmp_bt.ids()):
            raise ValueError('The samples in the biom table are a superset of' +
                             ' the samples in the mapping file. The script will abort in' +
                             ' this case even though the calculations wouldn\'t be' +
                             ' affected, to ensure consistency within QIIME. Pass the' +
                             ' --biom_samples_are_superset option to disable this behavior.')
    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        # Single parenthesised argument: prints the same text whether print
        # is a statement or a function.
        print('The following samples were not shared between the mapping file' +
              ' and the biom file and will not be included in the analysis:\n' +
              ' '.join(nonshared_samples))

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    if not all(len(v) > 0 for v in cat_sam_indices.values()):
        raise ValueError('At least one metadata group has no samples. Check ' +
                         'that the mapping file has at least one sample for each value in ' +
                         'the passed category.')
    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error('The t-test and mann_whitney_u test may ' +
                            'only be used when there are two sample groups. Choose another ' +
                            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        # Total number of samples across all groups. A plain sum of lengths
        # is used because folding with ``len(x) + len(y)`` only yields a
        # count for exactly two groups: with a single group it hands back
        # the list itself, so the size check below would never fire.
        sams = sum(len(v) for v in cat_sam_indices.values())
        if sams <= 20:
            raise ValueError('The number of samples is too small to use the ' +
                             'Mann-Whitney-U normal approximation. Review the script ' +
                             'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError('It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test,
        GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(bt, test_stats, pvals,
                                                fdr_pvals, bon_pvals, means, cat_sam_indices, md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    # The context manager closes the file even if the write raises.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
Esempio n. 3
0
def main():
    """Run the group significance script from parsed command-line options.

    Steps, in order:
      1. Load the biom table and mapping file and sync them with
         sync_biom_and_mf.
      2. Group the samples by ``opts.category`` and look up their indices in
         the synced table.
      3. Validate that the requested test (``opts.test``) can be applied.
      4. Run the test, compute FDR and Bonferroni corrected p-values (capped
         at 1.0), and write the formatted, sorted lines to ``opts.output_fp``.

    Raises:
        ValueError: if any biom-table sample id is among the non-shared
            samples and --biom_samples_are_superset was not passed; if a
            metadata group has no samples; if 'mann_whitney_u' is requested
            with 20 or fewer samples in total; or if 'g_test' is requested
            on a table whose sums look like relative abundances.

    ``option_parser.error`` is called (presumably terminating the script,
    as optparse does) when a two-group test is requested with more than two
    groups.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any(i in nonshared_samples for i in tmp_bt.ids()):
            raise ValueError(
                'The samples in the biom table are a superset of' +
                ' the samples in the mapping file. The script will abort in' +
                ' this case even though the calculations wouldn\'t be' +
                ' affected, to ensure consistency within QIIME. Pass the' +
                ' --biom_samples_are_superset option to disable this behavior.'
            )
    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        # Single parenthesised argument: prints the same text whether print
        # is a statement or a function.
        print('The following samples were not shared between the mapping file' +
              ' and the biom file and will not be included in the analysis:\n' +
              ' '.join(nonshared_samples))

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    if not all(len(v) > 0 for v in cat_sam_indices.values()):
        raise ValueError(
            'At least one metadata group has no samples. Check ' +
            'that the mapping file has at least one sample for each value in '
            + 'the passed category.')
    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error(
            'The t-test and mann_whitney_u test may ' +
            'only be used when there are two sample groups. Choose another ' +
            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        # Total number of samples across all groups. A plain sum of lengths
        # is used because folding with ``len(x) + len(y)`` only yields a
        # count for exactly two groups: with a single group it hands back
        # the list itself, so the size check below would never fire.
        sams = sum(len(v) for v in cat_sam_indices.values())
        if sams <= 20:
            raise ValueError(
                'The number of samples is too small to use the ' +
                'Mann-Whitney-U normal approximation. Review the script ' +
                'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError(
                'It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(bt,
                                                test_stats,
                                                pvals,
                                                fdr_pvals,
                                                bon_pvals,
                                                means,
                                                cat_sam_indices,
                                                md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    # The context manager closes the file even if the write raises.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))