def test_run_group_significance_test(self):
    """Test that all group significance tests can be run.

    Each supported test name in GROUP_TEST_CHOICES (nonparametric and
    parametric t-test, bootstrapped and normal-approximation
    Mann-Whitney-U, ANOVA, G-test, Kruskal-Wallis) is exercised against
    precomputed expected test statistics, p-values, and per-group means,
    on two tables (BT_IN_1 and BT_4) and on even, uneven, and
    single-sample groupings.  Monte-Carlo based tests are seeded so the
    expected values are reproducible.
    """
    bt = parse_biom_table(BT_IN_1)
    bt_4 = parse_biom_table(BT_4)
    # test with non-parametric t-test
    sample_indices = {'cat1': [0, 5, 1], 'cat2': [2, 4, 3]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [0.17503798979747345, 0.20029818620053824,
                      -1.5065313062753816, -0.043884559904114794,
                      -1.0631239617935129, -1.2878361428003895]
    # we are expecting 1001 comparisons (map returns a list on Python 2)
    exp_pvals = map(lambda x: x/1001., [888, 899, 279, 1001, 489, 299])
    exp_means = [[52.333333333333336, 48.333333333333336],
                 [34.0, 30.333333333333332],
                 [20.0, 49.333333333333336],
                 [55.333333333333336, 56.0],
                 [20.0, 38.0],
                 [30.0, 60.333333333333336]]
    seed(0)  # seed prng for reproducibility
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'nonparametric_t_test',
                                    GROUP_TEST_CHOICES, reps=1000)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with nonparametric t-test but different ordering
    sample_indices = {'cat1': [0, 1, 5], 'cat2': [4, 3, 2]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    seed(0)  # seed prng for reproducibility
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'nonparametric_t_test',
                                    GROUP_TEST_CHOICES, reps=1000)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with BT_4 biom table
    sample_indices = {'cat1': [0, 3, 1, 4], 'cat2': [5, 2, 7, 6]}
    row_gen = group_significance_row_generator(bt_4, sample_indices)
    exp_test_stats = [-0.38741397129147953, -0.38334158591463874,
                      0.077468274988510541, -0.2322539745918096,
                      0.16469600468808282, -0.49589486133213057]
    # we are expecting 1001 comparisons
    exp_pvals = map(lambda x: x/1001., [821, 719, 916, 935, 938, 604])
    exp_means = [[43.5, 51.75], [29.75, 34.75], [41.5, 40.0],
                 [50.5, 53.75], [28.0, 25.5], [41.75, 54.0]]
    seed(0)
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'nonparametric_t_test',
                                    GROUP_TEST_CHOICES, reps=1000)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with parametric t test
    # bt_1 agrees with Prism
    sample_indices = {'cat1': [4, 1, 2], 'cat2': [5, 0, 3]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [-1.0504514628777806, -0.94113003446934629,
                      -0.66264262463016887, 0.17617555832772411,
                      1.1144416530351877, -1.2483315640812607]
    exp_pvals = [0.3527834167236007, 0.39992473225679626,
                 0.5437923932346147, 0.8687158192049661,
                 0.32753202812350557, 0.27998887149482976]
    exp_means = [[39.666666666666664, 61.0],
                 [24.333333333333332, 40.0],
                 [27.0, 42.333333333333336],
                 [57.0, 54.333333333333336],
                 [38.333333333333336, 19.666666666666668],
                 [30.333333333333332, 60.0]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'parametric_t_test',
                                    GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with BT_4
    sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
    row_gen = group_significance_row_generator(bt_4, sample_indices)
    exp_test_stats = [0.43577690622483684, -2.5911938781738648,
                      -1.3573515147239095, 1.2101173913086851,
                      2.137178815882979, 0.0099191576638653078]
    exp_pvals = [0.67823972846362579, 0.041145883121579255,
                 0.2235024418313547, 0.27174025956151748,
                 0.076447615888438444, 0.9924073718332862]
    exp_means = [[52.25, 43.0], [20.5, 44.0], [29.25, 52.25],
                 [59.75, 44.5], [39.0, 14.5], [48.0, 47.75]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'parametric_t_test',
                                    GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with bootstrapped mann_whitney_u
    sample_indices = {'cat1': [4, 1, 2], 'cat2': [5, 0, 3]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [7.0, 7.0, 7.0, 6.0, 7.0, 7.0]
    exp_pvals = [0.333, 0.305, 0.3, 0.623, 0.295, 0.334]
    exp_means = [[39.666666666666664, 61.0],
                 [24.333333333333332, 40.0],
                 [27.0, 42.333333333333336],
                 [57.0, 54.333333333333336],
                 [38.333333333333336, 19.666666666666668],
                 [30.333333333333332, 60.0]]
    seed(0)  # seed prng for reproducibility
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'bootstrap_mann_whitney_u',
                                    GROUP_TEST_CHOICES, reps=1000)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with BT_4
    sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
    row_gen = group_significance_row_generator(bt_4, sample_indices)
    exp_test_stats = [10.0, 15.0, 11.0, 14.0, 15.0, 9.0]
    exp_pvals = [0.605, 0.033, 0.414, 0.097, 0.041, 0.814]
    exp_means = [[52.25, 43.0], [20.5, 44.0], [29.25, 52.25],
                 [59.75, 44.5], [39.0, 14.5], [48.0, 47.75]]
    seed(0)  # seed prng for reproducibility
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'bootstrap_mann_whitney_u',
                                    GROUP_TEST_CHOICES, reps=1000)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with parametric mann whitney u
    sample_indices = {'cat1': [0, 3, 1], 'cat2': [4, 2, 5]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [6.0, 6.0, 5.0, 5.0, 6.0, 5.0]
    exp_pvals = [0.51269076026192328, 0.51269076026192328,
                 0.82725934656271127, 0.82725934656271127,
                 0.51269076026192328, 0.82725934656271127]
    exp_means = [[52.666666666666664, 48.0],
                 [23.666666666666668, 40.666666666666664],
                 [34.0, 35.333333333333336],
                 [56.333333333333336, 55.0],
                 [32.333333333333336, 25.666666666666668],
                 [46.0, 44.333333333333336]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'mann_whitney_u',
                                    GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with BT_4
    sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
    row_gen = group_significance_row_generator(bt_4, sample_indices)
    exp_test_stats = [10.0, 15.0, 11.0, 14.0, 15.0, 9.0]
    exp_pvals = [0.5637028616507731, 0.043308142810791955,
                 0.38363032713198975, 0.083264516663550406,
                 0.043308142810791955, 0.77282999268444752]
    exp_means = [[52.25, 43.0], [20.5, 44.0], [29.25, 52.25],
                 [59.75, 44.5], [39.0, 14.5], [48.0, 47.75]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'mann_whitney_u',
                                    GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with ANOVA
    sample_indices = {'cat1': [0, 3], 'cat2': [4, 5], 'cat3': [2, 1]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [0.022340083574413375, 20.028268551236753,
                      2.086854460093897, 0.96500593119810185,
                      4.8390804597701154, 0.54346882684796749]
    exp_pvals = [0.97806870848824634, 0.018391757629969238,
                 0.27043709109167957, 0.47468983920325486,
                 0.11510587547067222, 0.62890473306440042]
    exp_means = [[53.0, 46.5, 51.5], [28.5, 55.5, 12.5],
                 [50.0, 45.5, 8.5], [50.5, 47.5, 69.0],
                 [28.0, 9.0, 50.0], [65.0, 39.5, 31.0]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'ANOVA', GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with uneven group sizes
    sample_indices = {'cat1': [0, 2, 3, 1], 'cat2': [4, 5]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [0.05663963168179019, 16.436058700209646,
                      0.43828937472444823, 0.675244322576109,
                      4.7713717693836974, 0.083541102077687446]
    exp_pvals = [0.8235822412182755, 0.015422975290359022,
                 0.54414414026513325, 0.45738578176242134,
                 0.094285405564661875, 0.78691584834507211]
    exp_means = [[52.25, 46.5], [20.5, 55.5], [29.25, 45.5],
                 [59.75, 47.5], [39.0, 9.0], [48.0, 39.5]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'ANOVA', GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with bt_4
    sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
    row_gen = group_significance_row_generator(bt_4, sample_indices)
    exp_test_stats = [0.18990151199889027, 6.7142857142857144,
                      1.8424031345232912, 1.4643841007477372,
                      4.5675332910589734, 9.8389688760617899e-05]
    exp_pvals = [0.6782397284636259, 0.041145883121579234,
                 0.22350244183135481, 0.27174025956151771,
                 0.076447615888438403, 0.9924073718332751]
    exp_means = [[52.25, 43.0], [20.5, 44.0], [29.25, 52.25],
                 [59.75, 44.5], [39.0, 14.5], [48.0, 47.75]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'ANOVA', GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with g goodness of fit
    sample_indices = {'cat1': [0, 3], 'cat2': [4, 5], 'cat3': [2, 1]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [0.46328913071721711, 29.810689447160001,
                      37.234612591840595, 4.7031232724401875,
                      31.207185565457102, 13.332324853339509]
    exp_pvals = [0.79322801392154108, 3.3627225458535774e-07,
                 8.2149818410655555e-09, 0.09522034650579822,
                 1.6728066897036456e-07, 0.00127327567601971]
    exp_means = [[53.0, 46.5, 51.5], [28.5, 55.5, 12.5],
                 [50.0, 45.5, 8.5], [50.5, 47.5, 69.0],
                 [28.0, 9.0, 50.0], [65.0, 39.5, 31.0]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'g_test', GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with individual groups
    sample_indices = {'cat1': [0], 'cat2': [1], 'cat3': [3],
                      'cat4': [2], 'cat5': [5], 'cat6': [4]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [68.7536611489639, 62.908926545455522,
                      115.84654226008865, 26.819713749563704,
                      84.940231595557307, 105.37909384565077]
    exp_pvals = [1.8616725644907271e-13, 3.0403858229558975e-12,
                 2.3772983815049693e-23, 6.1843461955812955e-05,
                 7.7481603433718027e-17, 3.8768150325829967e-21]
    exp_means = [[28.0, 52.0, 78.0, 51.0, 77.0, 16.0],
                 [25.0, 14.0, 32.0, 11.0, 63.0, 48.0],
                 [31.0, 2.0, 69.0, 15.0, 27.0, 64.0],
                 [36.0, 68.0, 65.0, 70.0, 62.0, 33.0],
                 [16.0, 41.0, 40.0, 59.0, 3.0, 15.0],
                 [32.0, 8.0, 98.0, 54.0, 50.0, 29.0]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'g_test', GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with uneven length groups
    sample_indices = {'cat1': [0, 3, 4, 5], 'cat3': [2, 1]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [0.030099778845406742, 16.703388149486191,
                      29.941854048163027, 3.39187772427496,
                      14.935738277477988, 5.4519230964604013]
    exp_pvals = [0.86226402523867973, 4.3702877865113464e-05,
                 4.451983032513133e-08, 0.065518295867083964,
                 0.00011123571448583719, 0.019546798231055287]
    exp_means = [[49.75, 51.5], [42.0, 12.5], [47.75, 8.5],
                 [49.0, 69.0], [18.5, 50.0], [52.25, 31.0]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'g_test', GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with bt_4
    sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
    row_gen = group_significance_row_generator(bt_4, sample_indices)
    exp_test_stats = [0.8950130401309585, 8.6948783805472942,
                      6.5397009199496443, 2.2281537448054953,
                      11.541070115516771, 0.00064935138712822981]
    exp_pvals = [0.34412242732851783, 0.0031910540870178925,
                 0.010549308294222293, 0.13551569348660794,
                 0.00068075444949030543, 0.97967020739471489]
    exp_means = [[52.25, 43.0], [20.5, 44.0], [29.25, 52.25],
                 [59.75, 44.5], [39.0, 14.5], [48.0, 47.75]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'g_test', GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with Kruskal Wallis
    sample_indices = {'cat1': [0, 3], 'cat2': [4, 5], 'cat3': [2, 1]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [0.2857142857142847, 4.5714285714285694,
                      3.7142857142857117, 3.7142857142857117,
                      4.5714285714285694, 0.85714285714285765]
    exp_pvals = [0.86687789975018215, 0.10170139230422694,
                 0.15611804531597129, 0.15611804531597129,
                 0.10170139230422694, 0.65143905753105535]
    exp_means = [[53.0, 46.5, 51.5], [28.5, 55.5, 12.5],
                 [50.0, 45.5, 8.5], [50.5, 47.5, 69.0],
                 [28.0, 9.0, 50.0], [65.0, 39.5, 31.0]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'kruskal_wallis',
                                    GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with individual groups
    sample_indices = {'cat1': [0], 'cat2': [1], 'cat3': [3],
                      'cat4': [2], 'cat5': [5], 'cat6': [4]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [5.0, 5.0, 5.0, 5.0, 5.0, 5.0]
    exp_pvals = [0.41588018699550794, 0.41588018699550794,
                 0.41588018699550794, 0.41588018699550794,
                 0.41588018699550794, 0.41588018699550794]
    exp_means = [[28.0, 52.0, 78.0, 51.0, 77.0, 16.0],
                 [25.0, 14.0, 32.0, 11.0, 63.0, 48.0],
                 [31.0, 2.0, 69.0, 15.0, 27.0, 64.0],
                 [36.0, 68.0, 65.0, 70.0, 62.0, 33.0],
                 [16.0, 41.0, 40.0, 59.0, 3.0, 15.0],
                 [32.0, 8.0, 98.0, 54.0, 50.0, 29.0]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'kruskal_wallis',
                                    GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with uneven length groups
    sample_indices = {'cat1': [0, 3, 4, 5], 'cat3': [2, 1]}
    row_gen = group_significance_row_generator(bt, sample_indices)
    exp_test_stats = [0.0, 3.428571428571427, 3.428571428571427,
                      3.428571428571427, 3.428571428571427,
                      0.21428571428571175]
    exp_pvals = [1, 0.064077506451059238, 0.064077506451059238,
                 0.064077506451059238, 0.064077506451059238,
                 0.64342884356362262]
    exp_means = [[49.75, 51.5], [42.0, 12.5], [47.75, 8.5],
                 [49.0, 69.0], [18.5, 50.0], [52.25, 31.0]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'kruskal_wallis',
                                    GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
    # test with bt_4
    sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}
    row_gen = group_significance_row_generator(bt_4, sample_indices)
    exp_test_stats = [0.33333333333333215, 4.0833333333333321,
                      0.75903614457831325, 3.0, 4.0833333333333321,
                      0.083333333333332149]
    exp_pvals = [0.56370286165077377, 0.043308142810792101,
                 0.38363032713198986, 0.08326451666355042,
                 0.043308142810792101, 0.77282999268444919]
    exp_means = [[52.25, 43.0], [20.5, 44.0], [29.25, 52.25],
                 [59.75, 44.5], [39.0, 14.5], [48.0, 47.75]]
    obs_test_stats, obs_pvals, obs_means = \
        run_group_significance_test(row_gen, 'kruskal_wallis',
                                    GROUP_TEST_CHOICES)
    self.assertFloatEqual(exp_test_stats, obs_test_stats)
    self.assertFloatEqual(exp_pvals, obs_pvals)
    self.assertFloatEqual(exp_means, obs_means)
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) # sync the mapping file and the biom file tmp_bt = load_table(opts.otu_table_fp) tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt) # test error conditions for overlapping mf and bt if not opts.biom_samples_are_superset: # user indicates biom sample should be subset of mapping file samples if any([i in nonshared_samples for i in tmp_bt.ids()]): raise ValueError('The samples in the biom table are a superset of' + ' the samples in the mapping file. The script will abort in' + ' this case even though the calculations wouldn\'t be' + ' affected, to ensure consistency within QIIME. Pass the' + ' --biom_samples_are_superset option to disable this behavior.') # user wants non-overlapping samples printed out if opts.print_non_overlap: print 'The following samples were not shared between the mapping file' +\ ' and the biom file and will not be included in the analysis:\n' +\ ' '.join(nonshared_samples) # find group indices sam_cats = get_sample_cats(pmf, opts.category) cat_sam_groups = get_cat_sample_groups(sam_cats) cat_sam_indices = get_sample_indices(cat_sam_groups, bt) # sanity check to prevent inscrutable errors later if not all([len(v) > 0 for k, v in cat_sam_indices.items()]): raise ValueError('At least one metadata group has no samples. Check ' + 'that the mapping file has at least one sample for each value in ' + 'the passed category.') if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2: option_parser.error('The t-test and mann_whitney_u test may ' + 'only be used when there are two sample groups. 
Choose another ' + 'test or another metadata category.') # check that assumptions are met for a given test: if opts.test == 'mann_whitney_u': sams = reduce(lambda x, y: len(x) + len(y), cat_sam_indices.values()) if sams <= 20: raise ValueError('The number of samples is too small to use the ' + 'Mann-Whitney-U normal approximation. Review the script ' + 'documentation.') # check that the G-test was not selected if the table appears to be # relative abundance if opts.test == 'g_test': if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.): raise ValueError('It appears that the biom table you have passed ' 'is a relative abundance table where values i,j (obsevation i ' 'count in sample j) are fractional and the sum of the columns ' 'is 1.0. This will fail to work properly with the G-test. If ' 'your data sums to 1 in each column but your data is not ' 'relative abundance then the tests will fail anyway because ' 'of the reduced number of observations.') # run actual tests data_feed = group_significance_row_generator(bt, cat_sam_indices) test_stats, pvals, means = run_group_significance_test( data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations)) # calculate corrected pvals fdr_pvals = array(benjamini_hochberg_step_down(pvals)) bon_pvals = bonferroni_correction(pvals) # correct for cases where values above 1.0 due to correction fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals) bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals) # write output results after sorting lines = group_significance_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals, means, cat_sam_indices, md_key=opts.metadata_key) lines = sort_by_pval(lines, ind=2) o = open(opts.output_fp, 'w') o.writelines('\n'.join(lines)) o.close()
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) # sync the mapping file and the biom file tmp_bt = load_table(opts.otu_table_fp) tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt) # test error conditions for overlapping mf and bt if not opts.biom_samples_are_superset: # user indicates biom sample should be subset of mapping file samples if any([i in nonshared_samples for i in tmp_bt.ids()]): raise ValueError( 'The samples in the biom table are a superset of' + ' the samples in the mapping file. The script will abort in' + ' this case even though the calculations wouldn\'t be' + ' affected, to ensure consistency within QIIME. Pass the' + ' --biom_samples_are_superset option to disable this behavior.' ) # user wants non-overlapping samples printed out if opts.print_non_overlap: print 'The following samples were not shared between the mapping file' +\ ' and the biom file and will not be included in the analysis:\n' +\ ' '.join(nonshared_samples) # find group indices sam_cats = get_sample_cats(pmf, opts.category) cat_sam_groups = get_cat_sample_groups(sam_cats) cat_sam_indices = get_sample_indices(cat_sam_groups, bt) # sanity check to prevent inscrutable errors later if not all([len(v) > 0 for k, v in cat_sam_indices.items()]): raise ValueError( 'At least one metadata group has no samples. Check ' + 'that the mapping file has at least one sample for each value in ' + 'the passed category.') if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2: option_parser.error( 'The t-test and mann_whitney_u test may ' + 'only be used when there are two sample groups. 
Choose another ' + 'test or another metadata category.') # check that assumptions are met for a given test: if opts.test == 'mann_whitney_u': sams = reduce(lambda x, y: len(x) + len(y), cat_sam_indices.values()) if sams <= 20: raise ValueError( 'The number of samples is too small to use the ' + 'Mann-Whitney-U normal approximation. Review the script ' + 'documentation.') # check that the G-test was not selected if the table appears to be # relative abundance if opts.test == 'g_test': if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.): raise ValueError( 'It appears that the biom table you have passed ' 'is a relative abundance table where values i,j (obsevation i ' 'count in sample j) are fractional and the sum of the columns ' 'is 1.0. This will fail to work properly with the G-test. If ' 'your data sums to 1 in each column but your data is not ' 'relative abundance then the tests will fail anyway because ' 'of the reduced number of observations.') # run actual tests data_feed = group_significance_row_generator(bt, cat_sam_indices) test_stats, pvals, means = run_group_significance_test( data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations)) # calculate corrected pvals fdr_pvals = array(benjamini_hochberg_step_down(pvals)) bon_pvals = bonferroni_correction(pvals) # correct for cases where values above 1.0 due to correction fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals) bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals) # write output results after sorting lines = group_significance_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals, means, cat_sam_indices, md_key=opts.metadata_key) lines = sort_by_pval(lines, ind=2) o = open(opts.output_fp, 'w') o.writelines('\n'.join(lines)) o.close()