def main():
    """Run the group-significance analysis from the command line.

    Steps, in order:

    1. Parse the command-line options described by ``script_info``.
    2. Load the BIOM table and the mapping file and synchronise them with
       ``sync_biom_and_mf``.
    3. Validate that the requested test can be applied to the sample groups
       defined by ``opts.category``.
    4. Run the test on every row yielded by
       ``group_significance_row_generator`` and compute Benjamini-Hochberg
       and Bonferroni corrected p-values (each capped at 1.0).
    5. Format the results, pass them through ``sort_by_pval`` and write them
       to ``opts.output_fp``.

    Raises:
        ValueError: if the BIOM table contains samples that were not shared
            with the mapping file and ``--biom_samples_are_superset`` was
            not passed; if any metadata group has no samples; if
            ``mann_whitney_u`` is requested with 20 or fewer samples in
            total; or if ``g_test`` is requested on a table that appears to
            hold relative abundances.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any(i in nonshared_samples for i in tmp_bt.ids()):
            raise ValueError(
                'The samples in the biom table are a superset of' +
                ' the samples in the mapping file. The script will abort in' +
                ' this case even though the calculations wouldn\'t be' +
                ' affected, to ensure consistency within QIIME. Pass the' +
                ' --biom_samples_are_superset option to disable this behavior.')

    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        # A single parenthesised argument prints identically whether or not
        # print is a statement (Python 2) or a function (Python 3).
        print(
            'The following samples were not shared between the mapping file' +
            ' and the biom file and will not be included in the analysis:\n' +
            ' '.join(nonshared_samples))

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    if not all(len(v) > 0 for v in cat_sam_indices.values()):
        raise ValueError(
            'At least one metadata group has no samples. Check ' +
            'that the mapping file has at least one sample for each value in ' +
            'the passed category.')

    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error(
            'The t-test and mann_whitney_u test may ' +
            'only be used when there are two sample groups. Choose another ' +
            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        # Count samples across all groups. sum() yields an int for any
        # number of groups; the previous reduce() call returned the raw
        # index list when only one group was present, so the size check
        # below never fired in that case.
        sams = sum(len(v) for v in cat_sam_indices.values())
        if sams <= 20:
            raise ValueError(
                'The number of samples is too small to use the ' +
                'Mann-Whitney-U normal approximation. Review the script ' +
                'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError(
                'It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(
        bt, test_stats, pvals, fdr_pvals, bon_pvals, means, cat_sam_indices,
        md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    # The context manager closes the file even if the write fails; the
    # joined text is written in one call (no trailing newline, as before).
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
def main():
    """Entry point: test each observation for differences between groups.

    Reads the options described by ``script_info``, loads the BIOM table
    (``opts.otu_table_fp``) and mapping file (``opts.mapping_fp``), keeps
    what ``sync_biom_and_mf`` returns for both, groups the samples by
    ``opts.category``, checks the preconditions of the selected test, runs
    it, corrects the p-values (Benjamini-Hochberg step-down and Bonferroni,
    both clipped to a maximum of 1.0) and writes the formatted lines,
    after ``sort_by_pval``, to ``opts.output_fp``.

    Raises:
        ValueError: when the BIOM table has sample ids among the non-shared
            samples and ``--biom_samples_are_superset`` is not set; when a
            metadata group is empty; when ``mann_whitney_u`` is selected
            with 20 or fewer samples overall; or when ``g_test`` is selected
            and the table sums suggest relative abundances.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any(i in nonshared_samples for i in tmp_bt.ids()):
            raise ValueError(
                'The samples in the biom table are a superset of' +
                ' the samples in the mapping file. The script will abort in' +
                ' this case even though the calculations wouldn\'t be' +
                ' affected, to ensure consistency within QIIME. Pass the' +
                ' --biom_samples_are_superset option to disable this behavior.'
            )

    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        # Build the message first and print it as one parenthesised value,
        # which emits the same text under Python 2 and Python 3.
        non_overlap_msg = (
            'The following samples were not shared between the mapping file' +
            ' and the biom file and will not be included in the analysis:\n' +
            ' '.join(nonshared_samples))
        print(non_overlap_msg)

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    if not all(len(v) > 0 for v in cat_sam_indices.values()):
        raise ValueError(
            'At least one metadata group has no samples. Check ' +
            'that the mapping file has at least one sample for each value in ' +
            'the passed category.')

    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error(
            'The t-test and mann_whitney_u test may ' +
            'only be used when there are two sample groups. Choose another ' +
            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        # Total number of samples over every group. The earlier
        # reduce(lambda x, y: len(x) + len(y), ...) form handed back the
        # group's index list unchanged when there was only one group, which
        # made the comparison below meaningless; sum() always gives an int.
        sams = sum(len(v) for v in cat_sam_indices.values())
        if sams <= 20:
            raise ValueError(
                'The number of samples is too small to use the ' +
                'Mann-Whitney-U normal approximation. Review the script ' +
                'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError(
                'It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(
        bt, test_stats, pvals, fdr_pvals, bon_pvals, means, cat_sam_indices,
        md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    # with-block guarantees the handle is closed on error; write() replaces
    # writelines(), which was iterating the joined string per character.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))