Exemple #1
0
    def test_sort_by_pval(self):
        """sort_by_pval puts the expected rows at the checked positions.

        The input is one header line followed by six tab-separated OTU rows.
        It is sorted three times -- on column 2 (P), column 3 (FDR_P) and
        column 4 (Bonferroni_P) -- and one position of each result is checked.
        """
        header = ('OTU\tTest-Statistic\tP\tFDR_P\tBonferroni_P\t'
                  'cat1_mean\tcat2_mean\tTaxonomy')
        otu1 = ('OTU1\t0.189901511999\t0.678239728464\t0.813887674156\t'
                '4.06943837078\t52.25\t43.0\tk__One')
        otu2 = ('OTU2\t6.71428571429\t0.0411458831216\t0.246875298729\t'
                '0.246875298729\t20.5\t44.0\tk__Two')
        otu3 = ('OTU3\t1.84240313452\t0.223502441831\t0.447004883663\t'
                '1.34101465099\t29.25\t52.25\tk__Three')
        otu4 = ('OTU4\t1.46438410075\t0.271740259562\t0.407610389342\t'
                '1.63044155737\t59.75\t44.5\tk__Four')
        otu5 = ('OTU5\t4.56753329106\t0.0764476158884\t0.229342847665\t'
                '0.458685695331\t39.0\t14.5\tk__Five')
        otu6 = ('OTU6\t9.83896887606e-05\t0.992407371833\t0.992407371833\t'
                '5.954444231\t48.0\t47.75\tk__Six')
        table = [header, otu1, otu2, otu3, otu4, otu5, otu6]

        # Run every sort before asserting anything, in the same column order
        # (P, FDR_P, Bonferroni_P).
        by_p, by_fdr, by_bonf = [sort_by_pval(table, column)
                                 for column in (2, 3, 4)]

        # OTU2 has the smallest P value and OTU5 the smallest FDR_P value;
        # each is expected at index 1.
        self.assertEqual(by_p[1], otu2)
        self.assertEqual(by_fdr[1], otu5)
        # OTU6 has the largest Bonferroni_P value and is expected at index 6.
        self.assertEqual(by_bonf[6], otu6)
def main():
    """Run the group significance script from parsed command line options.

    Steps, in order:
      1. Load the biom table and mapping file and sync their samples.
      2. Validate the sample overlap, the metadata groups and the
         preconditions of the selected test.
      3. Run the selected test over every row yielded by
         group_significance_row_generator.
      4. Compute FDR and Bonferroni corrected p-values, capped at 1.0.
      5. Format the results, sort them with sort_by_pval on column index 2,
         and write them to opts.output_fp.

    Raises:
        ValueError: if the biom table has samples missing from the mapping
            file and --biom_samples_are_superset was not passed; if any
            metadata group has no samples; if mann_whitney_u is requested
            with 20 or fewer samples in total; or if g_test is requested and
            the table sums look like relative abundances.

    Too many groups for a two-group test is reported through
    option_parser.error rather than raised directly.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any(i in nonshared_samples for i in tmp_bt.ids()):
            raise ValueError(
                'The samples in the biom table are a superset of'
                ' the samples in the mapping file. The script will abort in'
                ' this case even though the calculations wouldn\'t be'
                ' affected, to ensure consistency within QIIME. Pass the'
                ' --biom_samples_are_superset option to disable this behavior.')

    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        # Single parenthesised argument: prints the same text whether `print`
        # is the Python 2 statement or the print function.
        print('The following samples were not shared between the mapping file'
              ' and the biom file and will not be included in the analysis:\n' +
              ' '.join(nonshared_samples))

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    if not all(len(v) > 0 for v in cat_sam_indices.values()):
        raise ValueError(
            'At least one metadata group has no samples. Check '
            'that the mapping file has at least one sample for each value in '
            'the passed category.')
    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error(
            'The t-test and mann_whitney_u test may '
            'only be used when there are two sample groups. Choose another '
            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        # Total number of samples across all groups. A plain sum is correct
        # for any number of groups; the pairwise reduce it replaces only
        # produced a count when there were exactly two groups.
        sams = sum(len(v) for v in cat_sam_indices.values())
        if sams <= 20:
            raise ValueError(
                'The number of samples is too small to use the '
                'Mann-Whitney-U normal approximation. Review the script '
                'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError(
                'It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(
        bt, test_stats, pvals, fdr_pvals, bon_pvals, means, cat_sam_indices,
        md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    # The context manager closes the file even if the write fails. The bytes
    # written are unchanged: lines joined by newlines, no trailing newline.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
def main():
    """Run the feature/metadata correlation script from parsed options.

    Steps, in order:
      1. Load the biom table and the mapping file.
      2. Keep the samples that are present in the biom table and whose value
         for opts.category is accepted by is_computable_float.
      3. Filter the table to those samples and sort it into the same order
         as the collected metadata values.
      4. For every observation vector compute a correlation coefficient and
         a p-value.
      5. Compute FDR and Bonferroni corrected p-values, capped at 1.0.
      6. Format the results, sort them with sort_by_pval on column index 2,
         and write them to opts.output_fp followed by a final newline.

    Problems with the options or the data are reported through
    option_parser.error: cscore without bootstrapped p-values, a category
    missing from the mapping file, or three or fewer samples left after
    filtering.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped':
        option_parser.error(cscore_error_text)

    bt = load_table(opts.otu_table_fp)
    pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)

    samples_to_correlate = []
    md_values_to_correlate = []
    # Build the lookup once as a set so each membership test below does not
    # rescan the whole id sequence.
    bt_sample_ids = set(bt.ids(axis='sample'))

    for sample_id, sample_md in pmf.items():
        if sample_id not in bt_sample_ids:
            continue  # sample in mf, but not bt
        try:
            v = is_computable_float(sample_md[opts.category])
        except KeyError:
            option_parser.error('The category (%s)' % opts.category +
                                ' was not found in the mapping file.')
        except ValueError:
            pass  # value couldn't be converted to float, ignore this sample
        else:
            # Append both together so the two lists stay index-aligned.
            samples_to_correlate.append(sample_id)
            md_values_to_correlate.append(v)

    # remove samples which are not found in the mapping file or do not have
    # metadata that converts to float
    bt.filter(ids_to_keep=samples_to_correlate, axis='sample')

    # sort the biom table so that feature values are retrieved in the same
    # order as the metadata in the samples they correspond to
    bt = bt.sort(sort_f=lambda _: samples_to_correlate, axis='sample')

    if bt.shape[1] <= 3:
        option_parser.error(filtration_error_text)

    rhos = []
    pvals = []
    for feature_vector in bt.iter_data(axis='observation'):
        rho = correlate(feature_vector,
                        md_values_to_correlate,
                        method=opts.test)
        pval = assign_correlation_pval(
            rho, len(feature_vector),
            method=opts.pval_assignment_method,
            permutations=opts.permutations,
            perm_test_fn=bootstrap_functions[opts.test],
            v1=feature_vector,
            v2=md_values_to_correlate)
        rhos.append(rho)
        pvals.append(pval)

    fdr_pvals = benjamini_hochberg_step_down(pvals)
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals)

    lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, bon_pvals,
                                       opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)

    # The context manager closes the file even if a write fails. The bytes
    # written are unchanged: newline-joined lines plus a final newline.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
        o.write('\n')
Exemple #4
0
def main():
    """Run the group significance script from parsed command line options.

    Steps, in order:
      1. Load the biom table and mapping file and sync their samples.
      2. Validate the sample overlap, the metadata groups and the
         preconditions of the selected test.
      3. Run the selected test over every row yielded by
         group_significance_row_generator.
      4. Compute FDR and Bonferroni corrected p-values, capped at 1.0.
      5. Format the results, sort them with sort_by_pval on column index 2,
         and write them to opts.output_fp.

    Raises:
        ValueError: if the biom table has samples missing from the mapping
            file and --biom_samples_are_superset was not passed; if any
            metadata group has no samples; if mann_whitney_u is requested
            with 20 or fewer samples in total; or if g_test is requested and
            the table sums look like relative abundances.

    Too many groups for a two-group test is reported through
    option_parser.error rather than raised directly.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any(i in nonshared_samples for i in tmp_bt.ids()):
            raise ValueError(
                'The samples in the biom table are a superset of'
                ' the samples in the mapping file. The script will abort in'
                ' this case even though the calculations wouldn\'t be'
                ' affected, to ensure consistency within QIIME. Pass the'
                ' --biom_samples_are_superset option to disable this behavior.'
            )

    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        # Single parenthesised argument: prints the same text whether `print`
        # is the Python 2 statement or the print function.
        print('The following samples were not shared between the mapping file'
              ' and the biom file and will not be included in the analysis:\n' +
              ' '.join(nonshared_samples))

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    if not all(len(v) > 0 for v in cat_sam_indices.values()):
        raise ValueError(
            'At least one metadata group has no samples. Check '
            'that the mapping file has at least one sample for each value in '
            'the passed category.')
    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error(
            'The t-test and mann_whitney_u test may '
            'only be used when there are two sample groups. Choose another '
            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        # Total number of samples across all groups. A plain sum is correct
        # for any number of groups; the pairwise reduce it replaces only
        # produced a count when there were exactly two groups.
        sams = sum(len(v) for v in cat_sam_indices.values())
        if sams <= 20:
            raise ValueError(
                'The number of samples is too small to use the '
                'Mann-Whitney-U normal approximation. Review the script '
                'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError(
                'It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(bt,
                                                test_stats,
                                                pvals,
                                                fdr_pvals,
                                                bon_pvals,
                                                means,
                                                cat_sam_indices,
                                                md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    # The context manager closes the file even if the write fails. The bytes
    # written are unchanged: lines joined by newlines, no trailing newline.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
def main():
    """Run the feature/metadata correlation script from parsed options.

    Steps, in order:
      1. Load the biom table and the mapping file.
      2. Keep the samples that are present in the biom table and whose value
         for opts.category is accepted by is_computable_float.
      3. Filter the table to those samples and sort it into the same order
         as the collected metadata values.
      4. For every observation vector compute a correlation coefficient and
         a p-value.
      5. Compute FDR and Bonferroni corrected p-values, capped at 1.0.
      6. Format the results, sort them with sort_by_pval on column index 2,
         and write them to opts.output_fp followed by a final newline.

    Problems with the options or the data are reported through
    option_parser.error: cscore without bootstrapped p-values, a category
    missing from the mapping file, or three or fewer samples left after
    filtering.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped':
        option_parser.error(cscore_error_text)

    bt = load_table(opts.otu_table_fp)
    pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)

    samples_to_correlate = []
    md_values_to_correlate = []
    # Build the lookup once as a set so each membership test below does not
    # rescan the whole id sequence.
    bt_sample_ids = set(bt.ids(axis='sample'))

    for sample_id, sample_md in pmf.items():
        if sample_id not in bt_sample_ids:
            continue  # sample in mf, but not bt
        try:
            v = is_computable_float(sample_md[opts.category])
        except KeyError:
            option_parser.error('The category (%s)' % opts.category +
                                ' was not found in the mapping file.')
        except ValueError:
            pass  # value couldn't be converted to float, ignore this sample
        else:
            # Append both together so the two lists stay index-aligned.
            samples_to_correlate.append(sample_id)
            md_values_to_correlate.append(v)

    # remove samples which are not found in the mapping file or do not have
    # metadata that converts to float
    bt.filter(ids_to_keep=samples_to_correlate, axis='sample')

    # sort the biom table so that feature values are retrieved in the same
    # order as the metadata in the samples they correspond to.
    # The result must be rebound: calling sort() and discarding its return
    # value left the table in its original order, so feature vectors could
    # be paired with the wrong metadata values.
    bt = bt.sort(sort_f=lambda _: samples_to_correlate, axis='sample')

    if bt.shape[1] <= 3:
        option_parser.error(filtration_error_text)

    rhos = []
    pvals = []
    for feature_vector in bt.iter_data(axis='observation'):
        rho = correlate(feature_vector, md_values_to_correlate,
                        method=opts.test)
        pval = assign_correlation_pval(
            rho, len(feature_vector),
            method=opts.pval_assignment_method,
            permutations=opts.permutations,
            perm_test_fn=bootstrap_functions[opts.test],
            v1=feature_vector,
            v2=md_values_to_correlate)
        rhos.append(rho)
        pvals.append(pval)

    fdr_pvals = benjamini_hochberg_step_down(pvals)
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals)

    lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals,
                                       bon_pvals, opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)

    # The context manager closes the file even if a write fails. The bytes
    # written are unchanged: newline-joined lines plus a final newline.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
        o.write('\n')