Beispiel #1
0
    def test_group_significance_output_formatter(self):
        """output_formatter works"""
        #Using ANOVA test for example
        otu_table_1 = """{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "BIOM-Format 1.1.2","date": "2013-08-16T15:23:02.872397","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 8],"data": [[0,0,28.0],[0,1,52.0],[0,2,51.0],[0,3,78.0],[0,4,16.0],[0,5,77.0],[0,6,73.0],[0,7,6.0],[1,0,25.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[1,6,27.0],[1,7,38.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[2,6,64.0],[2,7,54.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[3,6,60.0],[3,7,23.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[4,6,35.0],[4,7,5.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0],[5,6,93.0],[5,7,19.0]],"rows": [{"id": "OTU1", "metadata": {"taxonomy": "k__One"}},{"id": "OTU2", "metadata": {"taxonomy": "k__Two"}},{"id": "OTU3", "metadata": {"taxonomy": "k__Three"}},{"id": "OTU4", "metadata": {"taxonomy": "k__Four"}},{"id": "OTU5", "metadata": {"taxonomy": "k__Five"}},{"id": "OTU6", "metadata": {"taxonomy": "k__Six"}}],"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": "Sample6", "metadata": null},{"id": "Sample7", "metadata": null},{"id": "Sample8", "metadata": null}]}"""
        bt = parse_biom_table(otu_table_1)

        test_stats = [0.18990151199889027,
             6.7142857142857144,
             1.8424031345232912,
             1.4643841007477372,
             4.5675332910589734,
             9.8389688760617899e-05]
        pvals = [0.6782397284636259,
             0.041145883121579234,
             0.22350244183135481,
             0.27174025956151771,
             0.076447615888438403,
             0.9924073718332751]
        means = [[52.25, 43.0],
             [20.5, 44.0],
             [29.25, 52.25],
             [59.75, 44.5],
             [39.0, 14.5],
             [48.0, 47.75]]
        fdr_pvals = [0.8138876741563511,
             0.24687529872947539,
             0.44700488366270963,
             0.40761038934227656,
             0.22934284766531521,
             0.9924073718332751]
        bon_pvals = [4.069438370781755,
             0.2468752987294754,
             1.3410146509881289,
             1.6304415573691062,
             0.4586856953306304,
             5.95444423099965]
        cat_sample_indices = {'cat1': [0, 1, 2, 3], 'cat2': [4, 5, 6, 7]}

        line_header_out = 'OTU\tTest-Statistic\tP\tFDR_P\tBonferroni_P\tcat1_mean\tcat2_mean\ttaxonomy'
        line_1_out = 'OTU1\t0.189901511999\t0.678239728464\t0.813887674156\t4.06943837078\t52.25\t43.0\tk__One'
        line_6_out = 'OTU6\t9.83896887606e-05\t0.992407371833\t0.992407371833\t5.954444231\t48.0\t47.75\tk__Six'

        lines = group_significance_output_formatter(bt, test_stats, pvals, 
            fdr_pvals, bon_pvals, means, cat_sample_indices, md_key='taxonomy')

        self.assertEqual(lines[0], line_header_out)
        self.assertEqual(lines[1], line_1_out)
        self.assertEqual(lines[6], line_6_out)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any([i in nonshared_samples for i in tmp_bt.ids()]):
            raise ValueError('The samples in the biom table are a superset of' +
                             ' the samples in the mapping file. The script will abort in' +
                             ' this case even though the calculations wouldn\'t be' +
                             ' affected, to ensure consistency within QIIME. Pass the' +
                             ' --biom_samples_are_superset option to disable this behavior.')
    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        print 'The following samples were not shared between the mapping file' +\
            ' and the biom file and will not be included in the analysis:\n' +\
            ' '.join(nonshared_samples)

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    if not all([len(v) > 0 for k, v in cat_sam_indices.items()]):
        raise ValueError('At least one metadata group has no samples. Check ' +
                         'that the mapping file has at least one sample for each value in ' +
                         'the passed category.')
    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error('The t-test and mann_whitney_u test may ' +
                            'only be used when there are two sample groups. Choose another ' +
                            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        sams = reduce(lambda x, y: len(x) + len(y), cat_sam_indices.values())
        if sams <= 20:
            raise ValueError('The number of samples is too small to use the ' +
                             'Mann-Whitney-U normal approximation. Review the script ' +
                             'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError('It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test,
        GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(bt, test_stats, pvals,
                                                fdr_pvals, bon_pvals, means, cat_sam_indices, md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    o = open(opts.output_fp, 'w')
    o.writelines('\n'.join(lines))
    o.close()
Beispiel #3
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any([i in nonshared_samples for i in tmp_bt.ids()]):
            raise ValueError(
                'The samples in the biom table are a superset of' +
                ' the samples in the mapping file. The script will abort in' +
                ' this case even though the calculations wouldn\'t be' +
                ' affected, to ensure consistency within QIIME. Pass the' +
                ' --biom_samples_are_superset option to disable this behavior.'
            )
    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        print 'The following samples were not shared between the mapping file' +\
            ' and the biom file and will not be included in the analysis:\n' +\
            ' '.join(nonshared_samples)

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    if not all([len(v) > 0 for k, v in cat_sam_indices.items()]):
        raise ValueError(
            'At least one metadata group has no samples. Check ' +
            'that the mapping file has at least one sample for each value in '
            + 'the passed category.')
    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error(
            'The t-test and mann_whitney_u test may ' +
            'only be used when there are two sample groups. Choose another ' +
            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        sams = reduce(lambda x, y: len(x) + len(y), cat_sam_indices.values())
        if sams <= 20:
            raise ValueError(
                'The number of samples is too small to use the ' +
                'Mann-Whitney-U normal approximation. Review the script ' +
                'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError(
                'It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(bt,
                                                test_stats,
                                                pvals,
                                                fdr_pvals,
                                                bon_pvals,
                                                means,
                                                cat_sam_indices,
                                                md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    o = open(opts.output_fp, 'w')
    o.writelines('\n'.join(lines))
    o.close()