def main():
    """Run a group significance test on a biom table and write the results.

    Workflow, in order:
      1. Parse the command line options described by ``script_info``.
      2. Load the biom table and mapping file and sync them with
         ``sync_biom_and_mf``.
      3. Validate the inputs against the selected test.
      4. Run the test, compute FDR and Bonferroni corrected p-values
         (capped at 1.0), format the lines, sort them with
         ``sort_by_pval`` and write them to ``opts.output_fp``.

    Raises:
        ValueError: if the biom table holds samples missing from the mapping
            file and ``--biom_samples_are_superset`` was not passed; if a
            metadata group has no samples; if ``mann_whitney_u`` is selected
            with 20 or fewer samples in total; or if ``g_test`` is selected
            and the table looks like a relative abundance table.

    A two-group test requested with more than two sample groups is reported
    through ``option_parser.error`` instead of an exception.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any(i in nonshared_samples for i in tmp_bt.ids()):
            raise ValueError(
                'The samples in the biom table are a superset of'
                ' the samples in the mapping file. The script will abort in'
                ' this case even though the calculations wouldn\'t be'
                ' affected, to ensure consistency within QIIME. Pass the'
                ' --biom_samples_are_superset option to disable this behavior.')

    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        # A single parenthesised argument prints the same text whether print
        # is a statement or a function.
        print('The following samples were not shared between the mapping file'
              ' and the biom file and will not be included in the analysis:\n'
              + ' '.join(nonshared_samples))

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    # (explicit len() rather than truthiness, in case the index containers
    # are arrays whose truth value is ambiguous)
    if not all(len(v) > 0 for v in cat_sam_indices.values()):
        raise ValueError(
            'At least one metadata group has no samples. Check '
            'that the mapping file has at least one sample for each value in '
            'the passed category.')

    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error(
            'The t-test and mann_whitney_u test may '
            'only be used when there are two sample groups. '
            'Choose another '
            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        # Total number of samples over all groups. sum() is correct for any
        # number of groups; the previous reduce()-based count only produced
        # an integer when there were exactly two groups.
        sams = sum(len(v) for v in cat_sam_indices.values())
        if sams <= 20:
            raise ValueError(
                'The number of samples is too small to use the '
                'Mann-Whitney-U normal approximation. Review the script '
                'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or \
                (bt.sum(axis='whole') == 1.):
            raise ValueError(
                'It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(
        bt, test_stats, pvals, fdr_pvals, bon_pvals, means, cat_sam_indices,
        md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)

    # The context manager closes the file even if the write fails.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
def main():
    """Correlate each observation with a numeric metadata category.

    Workflow, in order:
      1. Parse the command line options described by ``script_info``.
      2. Load the biom table and the mapping file.
      3. Keep the samples that are present in both and whose value for
         ``opts.category`` is accepted by ``is_computable_float``.
      4. Restrict the table to those samples and order it to match the
         collected metadata values.
      5. For every observation, compute the correlation statistic and its
         p-value, then the FDR and Bonferroni corrected p-values (capped at
         1.0).
      6. Format the lines, sort them with ``sort_by_pval`` and write them to
         ``opts.output_fp``.

    Problems with the options or inputs (``cscore`` without the
    ``bootstrapped`` p-value method, a category missing from the mapping
    file, too few samples left after filtering) are reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped':
        option_parser.error(cscore_error_text)

    bt = load_table(opts.otu_table_fp)
    pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)

    samples_to_correlate = []
    md_values_to_correlate = []
    # Build the lookup once so each membership test below is a hash lookup
    # instead of a scan over all of the table's sample ids.
    bt_sample_ids = set(bt.ids(axis='sample'))

    for sample_id, sample_md in pmf.items():
        if sample_id not in bt_sample_ids:
            continue  # sample in mf, but not bt
        try:
            v = is_computable_float(sample_md[opts.category])
        except KeyError:
            option_parser.error('The category (%s)' % opts.category +
                                ' was not found in the mapping file.')
        except ValueError:
            pass  # value couldn't be converted to float, ignore this sample
        else:
            # The two lists are appended together so that position i in one
            # always corresponds to position i in the other.
            samples_to_correlate.append(sample_id)
            md_values_to_correlate.append(v)

    # remove samples which are not found in the mapping file or do not have
    # metadata that converts to float
    # NOTE(review): the return value is unused, so this relies on filter()
    # modifying bt in place - confirm against the installed biom version.
    bt.filter(ids_to_keep=samples_to_correlate, axis='sample')

    # sort the biom table so that feature values are retrieved in the same
    # order as the metadata in the samples they correspond to; the sort
    # function ignores its argument and returns the wanted order directly
    bt = bt.sort(sort_f=lambda _: samples_to_correlate, axis='sample')

    if bt.shape[1] <= 3:
        option_parser.error(filtration_error_text)

    rhos = []
    pvals = []
    for feature_vector in bt.iter_data(axis='observation'):
        rho = correlate(feature_vector, md_values_to_correlate,
                        method=opts.test)
        pval = assign_correlation_pval(
            rho, len(feature_vector),
            method=opts.pval_assignment_method,
            permutations=opts.permutations,
            perm_test_fn=bootstrap_functions[opts.test],
            v1=feature_vector,
            v2=md_values_to_correlate)
        rhos.append(rho)
        pvals.append(pval)

    fdr_pvals = benjamini_hochberg_step_down(pvals)
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals)

    lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals,
                                       bon_pvals, opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)

    # The context manager closes the file even if a write fails.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
        o.write('\n')
def main():
    """Run a group significance test on a biom table and write the results.

    The function parses the command line options described by
    ``script_info``, loads the biom table and mapping file, syncs them with
    ``sync_biom_and_mf``, validates the inputs against the selected test,
    runs the test, computes FDR and Bonferroni corrected p-values (capped at
    1.0), and writes the formatted lines, sorted with ``sort_by_pval``, to
    ``opts.output_fp``.

    Raises:
        ValueError: if the biom table holds samples missing from the mapping
            file and ``--biom_samples_are_superset`` was not passed; if a
            metadata group has no samples; if ``mann_whitney_u`` is selected
            with 20 or fewer samples in total; or if ``g_test`` is selected
            and the table looks like a relative abundance table.

    A two-group test requested with more than two sample groups is reported
    through ``option_parser.error`` instead of an exception.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    superset_allowed = opts.biom_samples_are_superset
    if not superset_allowed:
        # user indicates biom sample should be subset of mapping file samples
        table_has_extra_samples = any(
            sample_id in nonshared_samples for sample_id in tmp_bt.ids())
        if table_has_extra_samples:
            raise ValueError(
                'The samples in the biom table are a superset of'
                ' the samples in the mapping file. The script will abort in'
                ' this case even though the calculations wouldn\'t be'
                ' affected, to ensure consistency within QIIME. Pass the'
                ' --biom_samples_are_superset option to disable this behavior.'
            )

    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        non_overlap_msg = (
            'The following samples were not shared between the mapping file'
            ' and the biom file and will not be included in the analysis:\n'
            + ' '.join(nonshared_samples))
        # A single parenthesised argument prints the same text whether print
        # is a statement or a function.
        print(non_overlap_msg)

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    # (explicit len() rather than truthiness, in case the index containers
    # are arrays whose truth value is ambiguous)
    group_sizes = [len(v) for v in cat_sam_indices.values()]
    if not all(size > 0 for size in group_sizes):
        raise ValueError(
            'At least one metadata group has no samples. Check '
            'that the mapping file has at least one sample for each value in '
            'the passed category.')

    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error(
            'The t-test and mann_whitney_u test may '
            'only be used when there are two sample groups. '
            'Choose another '
            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        # Total number of samples over all groups. Summing the group sizes
        # is correct for any number of groups; the previous reduce()-based
        # count only produced an integer when there were exactly two groups.
        sams = sum(group_sizes)
        if sams <= 20:
            raise ValueError(
                'The number of samples is too small to use the '
                'Mann-Whitney-U normal approximation. Review the script '
                'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or \
                (bt.sum(axis='whole') == 1.):
            raise ValueError(
                'It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(
        bt, test_stats, pvals, fdr_pvals, bon_pvals, means, cat_sam_indices,
        md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)

    # The context manager closes the file even if the write fails.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
def main():
    """Correlate each observation with a numeric metadata category.

    The function parses the command line options described by
    ``script_info``, loads the biom table and mapping file, and keeps the
    samples that are present in both and whose value for ``opts.category``
    is accepted by ``is_computable_float``. The table is restricted to those
    samples and ordered to match the collected metadata values. For every
    observation it computes the correlation statistic and its p-value, then
    the FDR and Bonferroni corrected p-values (capped at 1.0). The formatted
    lines, sorted with ``sort_by_pval``, are written to ``opts.output_fp``.

    Problems with the options or inputs (``cscore`` without the
    ``bootstrapped`` p-value method, a category missing from the mapping
    file, too few samples left after filtering) are reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped':
        option_parser.error(cscore_error_text)

    bt = load_table(opts.otu_table_fp)
    pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)

    samples_to_correlate = []
    md_values_to_correlate = []
    # Build the lookup once so each membership test below is a hash lookup
    # instead of a scan over all of the table's sample ids.
    bt_sample_ids = set(bt.ids(axis='sample'))

    for sample_id, sample_md in pmf.items():
        if sample_id not in bt_sample_ids:
            continue  # sample in mf, but not bt
        try:
            v = is_computable_float(sample_md[opts.category])
        except KeyError:
            option_parser.error('The category (%s)' % opts.category +
                                ' was not found in the mapping file.')
        except ValueError:
            pass  # value couldn't be converted to float, ignore this sample
        else:
            # The two lists are appended together so that position i in one
            # always corresponds to position i in the other.
            samples_to_correlate.append(sample_id)
            md_values_to_correlate.append(v)

    # remove samples which are not found in the mapping file or do not have
    # metadata that converts to float
    # NOTE(review): the return value is unused, so this relies on filter()
    # modifying bt in place - confirm against the installed biom version.
    bt.filter(ids_to_keep=samples_to_correlate, axis='sample')

    # sort the biom table so that feature values are retrieved in the same
    # order as the metadata in the samples they correspond to.
    # Table.sort returns the reordered table instead of reordering in place,
    # so the result has to be rebound; discarding it leaves the feature
    # vectors in the table's original sample order, misaligned with
    # md_values_to_correlate.
    bt = bt.sort(sort_f=lambda _: samples_to_correlate, axis='sample')

    if bt.shape[1] <= 3:
        option_parser.error(filtration_error_text)

    rhos = []
    pvals = []
    for feature_vector in bt.iter_data(axis='observation'):
        rho = correlate(feature_vector, md_values_to_correlate,
                        method=opts.test)
        pval = assign_correlation_pval(
            rho, len(feature_vector),
            method=opts.pval_assignment_method,
            permutations=opts.permutations,
            perm_test_fn=bootstrap_functions[opts.test],
            v1=feature_vector,
            v2=md_values_to_correlate)
        rhos.append(rho)
        pvals.append(pval)

    fdr_pvals = benjamini_hochberg_step_down(pvals)
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals)

    lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals,
                                       bon_pvals, opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)

    # The context manager closes the file even if a write fails.
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
        o.write('\n')