def run_correlation_test(data_generator, test, test_choices,
                         pval_assignment_method, permutations=None):
    """Correlate every (otu, metadata) pair and assign a p-value to each.

    Inputs:
     data_generator - iterable of 2-tuples (otu_vals, md_vals); per the
      original docs this is the generator from correlation_row_generator.
     test - str, key into test_choices selecting the correlation function.
     test_choices - dict mapping test names to callables taking
      (otu_vals, md_vals); CORRELATION_TEST_CHOICES per the original docs.
     pval_assignment_method - str, one of CORRELATION_PVALUE_CHOICES.
     permutations - int or None, number of permutations; only forwarded when
      pval_assignment_method is 'bootstrapped'.

    Returns a 2-tuple (corr_coefs, pvals) of lists, one entry per pair, in
    the order the pairs were produced by data_generator.
    """
    correlate_fn = test_choices[test]
    is_bootstrapped = pval_assignment_method == 'bootstrapped'

    coefficients = []
    pvalues = []
    for otu_vals, md_vals in data_generator:
        coefficient = correlate_fn(otu_vals, md_vals)

        # Only the bootstrapped method receives the permutation count, the
        # test function and the raw vectors; other methods get three args.
        pval_args = [coefficient, len(otu_vals), pval_assignment_method]
        if is_bootstrapped:
            pval_args.extend(
                (permutations, correlate_fn, otu_vals, md_vals))
        pvalue = assign_correlation_pval(*pval_args)

        coefficients.append(coefficient)
        pvalues.append(pvalue)

    return coefficients, pvalues
def run_grouped_correlation(md_vals, otu_arrays, test, test_choices,
                            pval_assignment_method, permutations=None):
    """Run grouped correlation test

    Computes, for each group, the correlation between that group's metadata
    vector and every row of that group's otu array, assigns a p-value to each
    coefficient, and then combines the per-group coefficients and p-values
    across groups with Fisher's methods. Groups are specified implicitly by
    the matching order of md_vals and otu_arrays.

    Inputs:
     md_vals - list of 1d arrays, continuous metadata to be correlated; one
      array per group.
     otu_arrays - list of 2d arrays, one per group; each row is correlated
      with the matching entry of md_vals. Every group needs the same number
      of rows so results can be stacked -- NOTE(review): confirm callers
      guarantee this.
     test - str, one of CORRELATION_TEST_CHOICES keys.
     test_choices - dict, CORRELATION_TEST_CHOICES.
     pval_assignment_method - str, one of CORRELATION_PVALUE_CHOICES.
     permutations - int or None, number of permutations to use for
      bootstrapped methods.

    Returns a 5-tuple (rhos, pvals, fisher_pvals, fisher_rhos, fisher_hs):
     rhos - list of 1d arrays, per-group correlation coefficients.
     pvals - list of 1d arrays, per-group p-values.
     fisher_pvals - result of applying fisher down the groups axis of pvals.
     fisher_rhos, fisher_hs - rows 0 and 1 of the result of applying
      fisher_population_correlation down the groups axis of rhos.
    """
    test_fn = test_choices[test]

    # Must be a real list: it is indexed below and handed to numpy. Under
    # Python 3 a bare map() is a one-shot iterator and is not subscriptable.
    sample_sizes = [len(group_md) for group_md in md_vals]

    # find the correlations. rhos is list of 1D arrays.
    rhos = []
    for i, group_md in enumerate(md_vals):
        # apply_along_axis passes each row followed by group_md to test_fn.
        rhos.append(apply_along_axis(test_fn, 1, otu_arrays[i], group_md))

    pvals = []
    for i, group_rhos in enumerate(rhos):
        group_pvals = zeros(len(group_rhos))
        for j, rho in enumerate(group_rhos):
            group_pvals[j] = assign_correlation_pval(
                rho, sample_sizes[i], pval_assignment_method, permutations,
                test_fn, otu_arrays[i][j], md_vals[i])
        pvals.append(group_pvals)

    # calculate combined stats (axis 0 runs across groups)
    fisher_pvals = apply_along_axis(fisher, 0, array(pvals))
    fisher_rho_and_h = apply_along_axis(fisher_population_correlation, 0,
                                        array(rhos), sample_sizes)
    return (rhos, pvals, fisher_pvals, fisher_rho_and_h[0],
            fisher_rho_and_h[1])
def run_grouped_correlation(md_vals, otu_arrays, test, test_choices,
                            pval_assignment_method, permutations=None):
    """Run grouped correlation test

    Computes, for each group, the correlation between that group's metadata
    vector and every row of that group's otu array, assigns a p-value to each
    coefficient, and then combines the per-group coefficients and p-values
    across groups with Fisher's methods. Groups are specified implicitly by
    the matching order of md_vals and otu_arrays.

    Inputs:
     md_vals - list of 1d arrays, continuous metadata to be correlated; one
      array per group.
     otu_arrays - list of 2d arrays, one per group; each row is correlated
      with the matching entry of md_vals. Every group needs the same number
      of rows so results can be stacked -- NOTE(review): confirm callers
      guarantee this.
     test - str, one of CORRELATION_TEST_CHOICES keys.
     test_choices - dict, CORRELATION_TEST_CHOICES.
     pval_assignment_method - str, one of CORRELATION_PVALUE_CHOICES.
     permutations - int or None, number of permutations to use for
      bootstrapped methods.

    Returns a 5-tuple (rhos, pvals, fisher_pvals, fisher_rhos, fisher_hs):
     rhos - list of 1d arrays, per-group correlation coefficients.
     pvals - list of 1d arrays, per-group p-values.
     fisher_pvals - result of applying fisher down the groups axis of pvals.
     fisher_rhos, fisher_hs - rows 0 and 1 of the result of applying
      fisher_population_correlation down the groups axis of rhos.
    """
    test_fn = test_choices[test]

    # Materialize the sizes: they are looked up by index and also forwarded
    # to numpy, neither of which works with Python 3's lazy map() iterator.
    sample_sizes = [len(group_md) for group_md in md_vals]

    # find the correlations. rhos is list of 1D arrays.
    rhos = []
    for i, group_md in enumerate(md_vals):
        # Each row of otu_arrays[i] is passed to test_fn along with group_md.
        rhos.append(apply_along_axis(test_fn, 1, otu_arrays[i], group_md))

    pvals = []
    for i, group_rhos in enumerate(rhos):
        group_pvals = zeros(len(group_rhos))
        for j, rho in enumerate(group_rhos):
            group_pvals[j] = assign_correlation_pval(
                rho,
                sample_sizes[i],
                pval_assignment_method,
                permutations,
                test_fn,
                otu_arrays[i][j],
                md_vals[i])
        pvals.append(group_pvals)

    # calculate combined stats (axis 0 runs across groups)
    fisher_pvals = apply_along_axis(fisher, 0, array(pvals))
    fisher_rho_and_h = apply_along_axis(fisher_population_correlation, 0,
                                        array(rhos), sample_sizes)
    return (
        rhos,
        pvals,
        fisher_pvals,
        fisher_rho_and_h[0],
        fisher_rho_and_h[1],
    )
def naive_cc_tool(bt, corr_method, pval_assignment_method, cval_fp, pval_fp):
    '''Calculate co-occurence using naive approach.

    Correlates every pair of observations (OTUs) in the table and writes two
    tab-separated square matrices: one of correlation coefficients and one of
    p-values. Only the upper triangle (column index > row index) is computed;
    the diagonal and lower triangle are left at 0.0.

    Inputs:
     bt - biom table with OTUs to be correlated.
     corr_method - str, correlation statistics to use, one of pearson,
      spearmans_rho, or kendalls_tau (a key of CORRELATION_TEST_CHOICES).
     pval_assignment_method - str, one of parametric_t_distribution,
      fisher_z_transform, bootstrapped, kendall. The literal string 'None'
      skips p-value calculation and records 1.0 for every computed pair.
     cval_fp - path the correlation coefficient matrix is written to.
     pval_fp - path the p-value matrix is written to.

    Returns None. Both files get a '#OTU ID' header row followed by one row
    per observation, with no trailing newline.
    '''
    # Fetch the ids once; they label both the header and every row.
    otu_ids = bt.ids(axis='observation')
    data = array([bt.data(i, axis='observation') for i in otu_ids])
    # Count rows from the ids so a table without observations does not fail
    # on unpacking a 1-D shape; it simply yields header-only files.
    r = len(otu_ids)

    ccs = zeros((r, r))
    ps = zeros((r, r))
    test_fn = CORRELATION_TEST_CHOICES[corr_method]
    for o1 in range(r):
        for o2 in range(o1 + 1, r):
            cc = test_fn(data[o1], data[o2])
            ccs[o1][o2] = cc
            # assign correlation pvalues
            if pval_assignment_method == 'None':
                ps[o1][o2] = 1.0
            else:
                ps[o1][o2] = assign_correlation_pval(
                    cc, len(data[o1]), pval_assignment_method,
                    permutations=1000, perm_test_fn=test_fn,
                    v1=data[o1], v2=data[o2])

    # write values
    header = '#OTU ID\t' + '\t'.join(otu_ids)
    for out_fp, matrix in ((cval_fp, ccs), (pval_fp, ps)):
        lines = [header]
        for otu_id, row in zip(otu_ids, matrix):
            lines.append(otu_id + '\t' + '\t'.join(map(str, row)))
        # Context manager guarantees the handle is closed even if the
        # write fails part-way through.
        with open(out_fp, 'w') as out:
            out.write('\n'.join(lines))
def naive_cc_tool(bt, corr_method, pval_assignment_method, cval_fp, pval_fp):
    """Calculate co-occurence using naive approach.

    Correlates every pair of observations (OTUs) in the table and writes two
    tab-separated square matrices: one of correlation coefficients and one of
    p-values. Only the upper triangle (column index > row index) is computed;
    the diagonal and lower triangle are left at 0.0.

    Inputs:
     bt - biom table with OTUs to be correlated.
     corr_method - str, correlation statistics to use, one of pearson,
      spearmans_rho, or kendalls_tau (a key of CORRELATION_TEST_CHOICES).
     pval_assignment_method - str, one of parametric_t_distribution,
      fisher_z_transform, bootstrapped, kendall. The literal string "None"
      skips p-value calculation and records 1.0 for every computed pair.
     cval_fp - path the correlation coefficient matrix is written to.
     pval_fp - path the p-value matrix is written to.

    Returns None. Both files get a "#OTU ID" header row followed by one row
    per observation, with no trailing newline.
    """
    # Fetch the ids once; they label both the header and every row.
    otu_ids = bt.ids(axis="observation")
    data = array([bt.data(i, axis="observation") for i in otu_ids])
    # Count rows from the ids so a table without observations does not fail
    # on unpacking a 1-D shape; it simply yields header-only files.
    r = len(otu_ids)

    ccs = zeros((r, r))
    ps = zeros((r, r))
    test_fn = CORRELATION_TEST_CHOICES[corr_method]
    for o1 in range(r):
        for o2 in range(o1 + 1, r):
            cc = test_fn(data[o1], data[o2])
            ccs[o1][o2] = cc
            # assign correlation pvalues
            if pval_assignment_method == "None":
                ps[o1][o2] = 1.0
            else:
                ps[o1][o2] = assign_correlation_pval(
                    cc,
                    len(data[o1]),
                    pval_assignment_method,
                    permutations=1000,
                    perm_test_fn=test_fn,
                    v1=data[o1],
                    v2=data[o2],
                )

    # write values
    header = "#OTU ID\t" + "\t".join(otu_ids)
    for out_fp, matrix in ((cval_fp, ccs), (pval_fp, ps)):
        lines = [header]
        for otu_id, row in zip(otu_ids, matrix):
            lines.append(otu_id + "\t" + "\t".join(map(str, row)))
        # Context manager guarantees the handle is closed even if the
        # write fails part-way through.
        with open(out_fp, "w") as out:
            out.write("\n".join(lines))
def main():
    """Correlate each observation with a numeric metadata category.

    Script entry point. Reads the options described by script_info, loads
    the table and the mapping file, keeps the samples that are present in
    both and whose value for opts.category converts to a float, then
    computes a correlation statistic and p-value for every observation.
    Benjamini-Hochberg and Bonferroni corrected p-values (capped at 1.0) are
    added and the formatted lines are written to opts.output_fp.

    Exits through option_parser.error when 'cscore' is requested with a
    non-bootstrapped p-value method, when the category is missing from the
    mapping file, or when the sample check after filtering fails.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped':
        option_parser.error(cscore_error_text)

    bt = load_table(opts.otu_table_fp)
    pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)

    samples_to_correlate = []
    md_values_to_correlate = []
    bt_sample_ids = bt.ids(axis='sample')

    for sample_id, sample_md in pmf.items():
        if sample_id not in bt_sample_ids:
            continue  # sample in mf, but not bt
        try:
            v = is_computable_float(sample_md[opts.category])
        except KeyError:
            option_parser.error('The category (%s)' % opts.category +
                                ' was not found in the mapping file.')
        except ValueError:
            pass  # value couldn't be converted to float, ignore this sample
        else:
            samples_to_correlate.append(sample_id)
            md_values_to_correlate.append(v)

    # remove samples which are not found in the mapping file or do not have
    # metadata that converts to float
    bt.filter(ids_to_keep=samples_to_correlate, axis='sample')

    # sort the biom table so that feature values are retrieved in the same
    # order as the metadata in the samples they correspond to
    bt = bt.sort(sort_f=lambda _: samples_to_correlate, axis='sample')

    # presumably bt.shape[1] is the number of samples left -- TODO confirm
    if bt.shape[1] <= 3:
        option_parser.error(filtration_error_text)

    rhos = []
    pvals = []
    for feature_vector in bt.iter_data(axis='observation'):
        rho = correlate(feature_vector, md_values_to_correlate,
                        method=opts.test)
        pval = assign_correlation_pval(
            rho, len(feature_vector),
            method=opts.pval_assignment_method,
            permutations=opts.permutations,
            perm_test_fn=bootstrap_functions[opts.test],
            v1=feature_vector,
            v2=md_values_to_correlate)
        rhos.append(rho)
        pvals.append(pval)

    fdr_pvals = benjamini_hochberg_step_down(pvals)
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals)

    lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, bon_pvals,
                                       opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)

    # Context manager closes the output file even if a write fails.
    with open(opts.output_fp, 'w') as out:
        out.write('\n'.join(lines))
        out.write('\n')
def main():
    """Correlate each observation with a numeric metadata category.

    Script entry point. Reads the options described by script_info, loads
    the table and the mapping file, keeps the samples that are present in
    both and whose value for opts.category converts to a float, then
    computes a correlation statistic and p-value for every observation.
    Benjamini-Hochberg and Bonferroni corrected p-values (capped at 1.0) are
    added and the formatted lines are written to opts.output_fp.

    Exits through option_parser.error when 'cscore' is requested with a
    non-bootstrapped p-value method, when the category is missing from the
    mapping file, or when the sample check after filtering fails.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped':
        option_parser.error(cscore_error_text)

    bt = load_table(opts.otu_table_fp)
    pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)

    samples_to_correlate = []
    md_values_to_correlate = []
    bt_sample_ids = bt.ids(axis='sample')

    for sample_id, sample_md in pmf.items():
        if sample_id not in bt_sample_ids:
            continue  # sample in mf, but not bt
        try:
            v = is_computable_float(sample_md[opts.category])
        except KeyError:
            option_parser.error('The category (%s)' % opts.category +
                                ' was not found in the mapping file.')
        except ValueError:
            pass  # value couldn't be converted to float, ignore this sample
        else:
            samples_to_correlate.append(sample_id)
            md_values_to_correlate.append(v)

    # remove samples which are not found in the mapping file or do not have
    # metadata that converts to float
    bt.filter(ids_to_keep=samples_to_correlate, axis='sample')

    # sort the biom table so that feature values are retrieved in the same
    # order as the metadata in the samples they correspond to.
    # Table.sort returns a new table rather than reordering in place, so the
    # result has to be kept; discarding it leaves the samples in their
    # original order and pairs feature values with the wrong metadata.
    bt = bt.sort(sort_f=lambda _: samples_to_correlate, axis='sample')

    # presumably bt.shape[1] is the number of samples left -- TODO confirm
    if bt.shape[1] <= 3:
        option_parser.error(filtration_error_text)

    rhos = []
    pvals = []
    for feature_vector in bt.iter_data(axis='observation'):
        rho = correlate(feature_vector, md_values_to_correlate,
                        method=opts.test)
        pval = assign_correlation_pval(
            rho, len(feature_vector),
            method=opts.pval_assignment_method,
            permutations=opts.permutations,
            perm_test_fn=bootstrap_functions[opts.test],
            v1=feature_vector,
            v2=md_values_to_correlate)
        rhos.append(rho)
        pvals.append(pval)

    fdr_pvals = benjamini_hochberg_step_down(pvals)
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals)

    lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, bon_pvals,
                                       opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)

    # Context manager closes the output file even if a write fails.
    with open(opts.output_fp, 'w') as out:
        out.write('\n'.join(lines))
        out.write('\n')