def main():
    """Write two distance matrices restricted to their shared sample IDs.

    Reads two comma-separated filepaths from ``opts.input_dms`` and two
    from ``opts.output_dms``, passes the parsed inputs through
    make_compatible_distance_matrices, and writes the two resulting
    matrices to the output filepaths.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_dm_fps = opts.input_dms.split(',')
    output_dm_fps = opts.output_dms.split(',')
    if len(input_dm_fps) != 2 or len(output_dm_fps) != 2:
        option_parser.error("You must provide exactly two input and output "
                            "distance matrix filepaths.")

    # Parse each input exactly once and close its handle right away. The
    # earlier code parsed every file twice, discarded the first results and
    # never closed any of the four handles.
    with open(input_dm_fps[0], 'U') as input1_f:
        parsed_dm1 = parse_distmat(input1_f)
    with open(input_dm_fps[1], 'U') as input2_f:
        parsed_dm2 = parse_distmat(input2_f)

    (dm1_labels, dm1), (dm2_labels, dm2) = make_compatible_distance_matrices(
        parsed_dm1, parsed_dm2)
    # Internal sanity check on the helper's output, not user-input validation.
    assert (dm1_labels == dm2_labels), "The order of sample IDs is not the " +\
        "same for the two matrices."

    # Both outputs are opened before either is written, as before.
    with open(output_dm_fps[0], 'w') as output1_f, \
            open(output_dm_fps[1], 'w') as output2_f:
        output1_f.write(format_distance_matrix(dm1_labels, dm1))
        output2_f.write(format_distance_matrix(dm2_labels, dm2))
def assemble_distance_matrix(dm_components):
    """ assemble distance matrix components into a complete dm string

    dm_components: iterable of components, each an iterable of lines.
     Within a component the first non-blank line is read as the column
     ids; every later non-blank line is a data row whose first field is
     the row's sample id and whose remaining fields are paired, in order,
     with the column ids.

    Returns whatever format_distance_matrix produces for the combined rows.
     Values are passed through as the strings read from the lines. A sample
     id that appears in more than one row keeps only the last row seen.

    Raises KeyError if some row has no value for one of the collected
     sample ids.
    """
    data = {}
    for component in dm_components:
        # column ids are per component, so reset them for each one
        col_ids = []
        for line in component:
            # split() with no argument splits on any run of whitespace
            fields = line.strip().split()
            if not fields:
                continue
            if not col_ids:
                # first non-blank line of the component: the header
                col_ids = fields
            else:
                data[fields[0]] = dict(zip(col_ids, fields[1:]))

    # rows and columns share one label order (the dict's iteration order)
    labels = list(data)
    dm = [[data[l1][l2] for l2 in labels] for l1 in labels]
    return format_distance_matrix(labels, dm)
def calc_shared_phylotypes(infile, reference_sample=None):
    """Count shared phylotypes for every pair of samples in an OTU table.

    infile: otu table filehandle, handed to parse_biom_table
    reference_sample: optional sample name. When set, each count is taken
     over the pair of samples plus this reference sample (useful e.g. when
     the reference is the donor in a transplant study).

    Returns the formatted matrix of counts followed by a newline.
    """
    table = parse_biom_table(infile)
    sample_ids = table.SampleIds

    # Pick the counting routine once instead of re-testing inside the loops.
    if reference_sample:
        def count_shared(first_id, second_id):
            return _calc_shared_phylotypes_multiple(
                table, [first_id, second_id, reference_sample])
    else:
        def count_shared(first_id, second_id):
            return _calc_shared_phylotypes_pairwise(
                table, first_id, second_id)

    shared_counts = zeros((len(sample_ids), len(sample_ids)), dtype=int)
    for row, row_id in enumerate(sample_ids):
        # only the lower triangle is computed; each value is mirrored
        for col, col_id in enumerate(sample_ids[:row + 1]):
            shared = count_shared(row_id, col_id)
            shared_counts[row, col] = shared
            shared_counts[col, row] = shared

    return format_distance_matrix(sample_ids, shared_counts) + "\n"
def assemble_distance_matrix(dm_components):
    """ assemble distance matrix components into a complete dm string

    dm_components: iterable of components, each an iterable of lines.
     Within a component the first non-blank line is read as the column
     ids; every later non-blank line is a data row whose first field is
     the row's sample id and whose remaining fields are paired, in order,
     with the column ids.

    Returns whatever format_distance_matrix produces for the combined rows.
     Values are passed through as the strings read from the lines. A sample
     id that appears in more than one row keeps only the last row seen.

    Raises KeyError if some row has no value for one of the collected
     sample ids.
    """
    data = {}
    for component in dm_components:
        # column ids are per component, so reset them for each one
        col_ids = []
        for line in component:
            # split() with no argument splits on any run of whitespace
            fields = line.strip().split()
            if not fields:
                continue
            if not col_ids:
                # first non-blank line of the component: the header
                col_ids = fields
            else:
                data[fields[0]] = dict(zip(col_ids, fields[1:]))

    # rows and columns share one label order (the dict's iteration order)
    labels = list(data)
    dm = [[data[l1][l2] for l2 in labels] for l1 in labels]
    return format_distance_matrix(labels, dm)
def calc_shared_phylotypes(infile, reference_sample=None):
    """Count shared phylotypes for every pair of samples in an OTU table.

    infile: otu table filehandle, handed to parse_otu_table
    reference_sample: optional sample name. When set, its position in the
     parsed sample ids is looked up and each count is taken over the pair
     of samples plus this reference sample (useful e.g. when the reference
     is the donor in a transplant study).

    Returns the formatted matrix of counts followed by a newline.
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(infile)

    # Pick the counting routine once instead of re-testing inside the loops.
    if reference_sample:
        ref_idx = sample_ids.index(reference_sample)

        def count_shared(first, second):
            return _calc_shared_phylotypes_multiple(
                otu_table, [first, second, ref_idx])
    else:
        def count_shared(first, second):
            return _calc_shared_phylotypes_pairwise(otu_table, first, second)

    # the second axis of the table gives the size of the result
    num_rows, num_samples = otu_table.shape
    shared_counts = zeros((num_samples, num_samples), dtype=int)
    for row in range(num_samples):
        # only the lower triangle is computed; each value is mirrored
        for col in range(row + 1):
            shared = count_shared(row, col)
            shared_counts[row, col] = shared
            shared_counts[col, row] = shared

    return format_distance_matrix(sample_ids, shared_counts) + "\n"
def distance_matrix(input_path, column):
    """ builds a distance matrix from one numeric mapping-file column

    inputs:
     input_path (file handler), handed to parse_mapping_file_to_dict
     column (str), name of the column to read for every sample

    The distance between two samples is the absolute difference of their
    values in that column. Writes a message to stderr and calls exit(1)
    when a sample lacks the column or holds a non-numeric value.
    """
    data, comments = parse_mapping_file_to_dict(input_path)

    sample_ids = []
    values = []
    for sample_id in data:
        sample_fields = data[sample_id]
        if column not in sample_fields:
            stderr.write("\n\nNo column: '%s' in the mapping file. Existing columns are: %s\n\n" % (column, sample_fields.keys()))
            exit(1)
        try:
            values.append(float(sample_fields[column]))
        except ValueError:
            stderr.write("\n\nall the values in the column '%s' must be numeric but '%s' has '%s'\n\n"
                         % (column, sample_id, sample_fields[column]))
            exit(1)
        sample_ids.append(sample_id)

    as_row = array(values)
    # broadcasting the 1-D values against an (n, 1) column gives all pairs
    as_column = reshape(as_row, (len(as_row), 1))
    return format_distance_matrix(sample_ids, abs(as_row - as_column))
def calc_shared_phylotypes(infile, reference_sample=None):
    """Count shared phylotypes for every pair of samples in an OTU table.

    infile: otu table filehandle, handed to parse_biom_table
    reference_sample: optional sample name. When set, each count is taken
     over the pair of samples plus this reference sample (useful e.g. when
     the reference is the donor in a transplant study).

    Returns the formatted matrix of counts followed by a newline.
    """
    table = parse_biom_table(infile)
    sample_ids = table.SampleIds

    # Pick the counting routine once instead of re-testing inside the loops.
    if reference_sample:
        def count_shared(first_id, second_id):
            return _calc_shared_phylotypes_multiple(
                table, [first_id, second_id, reference_sample])
    else:
        def count_shared(first_id, second_id):
            return _calc_shared_phylotypes_pairwise(
                table, first_id, second_id)

    shared_counts = zeros((len(sample_ids), len(sample_ids)), dtype=int)
    for row, row_id in enumerate(sample_ids):
        # only the lower triangle is computed; each value is mirrored
        for col, col_id in enumerate(sample_ids[:row + 1]):
            shared = count_shared(row_id, col_id)
            shared_counts[row, col] = shared
            shared_counts[col, row] = shared

    return format_distance_matrix(sample_ids, shared_counts) + "\n"
def distance_matrix(input_path, column):
    """ builds a distance matrix from one numeric mapping-file column

    inputs:
     input_path (file handler), handed to parse_mapping_file_to_dict
     column (str), name of the column to read for every sample

    The distance between two samples is the absolute difference of their
    values in that column. Writes a message to stderr and calls exit(1)
    when a sample lacks the column or holds a non-numeric value.
    """
    data, comments = parse_mapping_file_to_dict(input_path)

    sample_ids = []
    values = []
    for sample_id in data:
        sample_fields = data[sample_id]
        if column not in sample_fields:
            stderr.write(
                "\n\nNo column: '%s' in the mapping file. Existing columns are: %s\n\n" %
                (column, sample_fields.keys()))
            exit(1)
        try:
            values.append(float(sample_fields[column]))
        except ValueError:
            stderr.write("\n\nall the values in the column '%s' must be numeric but '%s' has '%s'\n\n"
                         % (column, sample_id, sample_fields[column]))
            exit(1)
        sample_ids.append(sample_id)

    as_row = array(values)
    # broadcasting the 1-D values against an (n, 1) column gives all pairs
    as_column = reshape(as_row, (len(as_row), 1))
    return format_distance_matrix(sample_ids, abs(as_row - as_column))
def test_format_distance_matrix(self):
    """format_distance_matrix should return tab-delimited dist mat"""
    sample_labels = [11, 22, 33]
    matrix = array([[1, 2, 3],
                    [4, 5, 6],
                    [7, 8, 9]])

    # header row of labels, then one labelled row per matrix row
    expected = "\t11\t22\t33\n11\t1\t2\t3\n22\t4\t5\t6\n33\t7\t8\t9"
    self.assertEqual(format_distance_matrix(sample_labels, matrix), expected)

    # two labels for a 3x3 matrix must be rejected
    self.assertRaises(ValueError, format_distance_matrix,
                      sample_labels[:2], matrix)
def test_format_distance_matrix(self):
    """format_distance_matrix should return tab-delimited dist mat"""
    sample_labels = [11, 22, 33]
    matrix = array([[1, 2, 3],
                    [4, 5, 6],
                    [7, 8, 9]])

    # header row of labels, then one labelled row per matrix row
    expected = '\t11\t22\t33\n11\t1\t2\t3\n22\t4\t5\t6\n33\t7\t8\t9'
    self.assertEqual(format_distance_matrix(sample_labels, matrix), expected)

    # two labels for a 3x3 matrix must be rejected
    self.assertRaises(ValueError, format_distance_matrix,
                      sample_labels[:2], matrix)
def main():
    """Write a distance matrix computed from mapping-file metadata.

    ``opts.column`` selects the mode:
     - a single column name: every sample's value is converted to float
       and passed to compute_distance_matrix_from_metadata;
     - two names separated by a comma: they are read as the latitude and
       longitude columns and passed to calculate_dist_vincenty.

    The formatted matrix is written to ``opts.output_fp``.

    Raises ValueError when a requested column is missing from a sample,
    when a value is not numeric, or when ``opts.column`` does not split
    into exactly two names in the comma-separated mode.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data, comments = parse_mapping_file_to_dict(opts.input_path)
    column_headers = []

    if ',' not in opts.column:
        column_data = []
        column_name = opts.column
        for i in data:
            if column_name not in data[i]:
                raise ValueError(
                    "No column: '%s' in the mapping file. Existing columns are: %s" %
                    (column_name, data[i].keys()))
            try:
                column_data.append(float(data[i][column_name]))
            except ValueError:
                raise ValueError(
                    "All the values in the column '%s' must be numeric but '%s' has '%s'" %
                    (column_name, i, data[i][column_name]))
            column_headers.append(i)
        dtx_mtx = compute_distance_matrix_from_metadata(column_data)
    else:
        latitudes = []
        longitudes = []
        try:
            latitude, longitude = opts.column.split(',')
        except ValueError:
            raise ValueError(
                "This script accepts a maximum of 2 colums separated by comma and you passed: %s" %
                (opts.column))
        for i in data:
            if latitude not in data[i] or longitude not in data[i]:
                raise ValueError(
                    "One of these columns or both do not exist: '%s' or '%s' in the mapping file. Existing columns are: %s" %
                    (latitude, longitude, data[i].keys()))
            try:
                # convert both before appending so the two lists never
                # get out of step
                lat_value = float(data[i][latitude])
                lon_value = float(data[i][longitude])
            except ValueError:
                # The earlier message referenced `column_name`, which is
                # never assigned in this branch, so a NameError masked the
                # real problem. Report both offending values instead.
                raise ValueError(
                    "All the values in the columns '%s' & '%s' must be numeric but '%s' has '%s' & '%s'" %
                    (latitude, longitude, i, data[i][latitude],
                     data[i][longitude]))
            latitudes.append(lat_value)
            longitudes.append(lon_value)
            column_headers.append(i)
        dtx_mtx = calculate_dist_vincenty(latitudes, longitudes)

    dtx_txt = format_distance_matrix(column_headers, dtx_mtx)
    with open(opts.output_fp, 'w') as f:
        f.write(dtx_txt)
def test_format_distance_matrix_almost_zero_diagonal(self):
    # Near-zero values are rounded to 0.0 only on the diagonal. The same
    # tiny value off the diagonal, and a diagonal value that is not close
    # enough to zero (1e-05), must come through unchanged.
    tiny = 0.0000000000001
    matrix = array([[0.00001, 1, tiny],
                    [1.0, tiny, 3],
                    [tiny, 3.0, 0.0]])
    expected = ('\tfoo\tbar\tbaz\nfoo\t1e-05\t1.0\t1e-13\nbar\t1.0'
                '\t0.0\t3.0\nbaz\t1e-13\t3.0\t0.0')
    formatted = format_distance_matrix(['foo', 'bar', 'baz'], matrix)
    self.assertEqual(formatted, expected)
def main():
    """Write a copy of a distance matrix with its labels shuffled.

    Parses ``opts.input_distance_matrix``, shuffles the label list in
    place, and writes the shuffled labels together with the original,
    unshuffled data to ``opts.output_distance_matrix``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # The input handle used to be left open; close it once parsing is done.
    with open(opts.input_distance_matrix, 'U') as input_f:
        labels, dm_data = parse_distmat(input_f)

    shuffle(labels)

    with open(opts.output_distance_matrix, 'w') as output_f:
        output_f.write(format_distance_matrix(labels, dm_data))
def compute_procrustes(result_tables,
                       expected_pc_lookup,
                       taxonomy_level=6,
                       num_dimensions=3,
                       random_trials=999):
    """ Compute Procrustes M2 and p-values for a set of results

        result_tables: 2d list of tables to be compared to expected tables,
         where the data in the inner list is:
          [dataset_id, reference_database_id, method_id,
           parameter_combination_id, table_fp]
        expected_pc_lookup: 2d dict of dataset_id, reference_db_id to
         principal coordinate matrix filepaths, for the expected result
         coordinate matrices
        taxonomy_level: level to compute results
        num_dimensions: passed to procrustes_monte_carlo as max_dimensions
        random_trials: passed to procrustes_monte_carlo as trials

        This is a generator. For each entry of result_tables it yields
         (dataset_id, reference_id, method_id, params, actual_m_squared,
          mc_p_value)

        Raises KeyError when expected_pc_lookup has no entry for a
         (dataset_id, reference_id) pair, and ValueError when a BIOM table
         cannot be parsed.
    """
    # Start code copied ALMOST* directly from compute_prfs - some re-factoring
    # for re-use is in order here. *ALMOST refers to changes to parser and
    # variable names since expected is a pc matrix here.
    for dataset_id, reference_id, method_id, params, actual_table_fp in result_tables:
        # look up the expected pc file for this dataset/reference pair
        try:
            expected_pc_fp = expected_pc_lookup[dataset_id][reference_id]
        except KeyError:
            raise KeyError("Can't find expected table for (%s, %s)." %
                           (dataset_id, reference_id))

        # parse the actual table and collapse it at the specified taxonomic
        # level; the handle is closed as soon as parsing finishes
        try:
            with open(actual_table_fp, "U") as actual_table_f:
                actual_table = parse_biom_table(actual_table_f)
        except ValueError:
            raise ValueError("Couldn't parse BIOM table: %s" % actual_table_fp)
        collapse_by_taxonomy = get_taxonomy_collapser(taxonomy_level)
        actual_table = actual_table.collapseObservationsByMetadata(
            collapse_by_taxonomy)
        # End code copied directly from compute_prfs.

        # Next block of code, how do I hate thee? Let me count the ways...
        # (1) dist_bray_curtis doesn't take a BIOM Table object
        # (2) pcoa takes a qiime-formatted distance matrix as a list of lines
        # (3) pcoa returns a qiime-formatted pc matrix
        # (4) procrustes_monte_carlo needs to pass through the pc "file"
        #     multiple times, so we actually *need* the pcs that get passed
        #     in to be lists of lines
        dm = dist_bray_curtis(
            asarray([v for v in actual_table.iterSampleData()]))
        formatted_dm = format_distance_matrix(actual_table.SampleIds, dm)
        actual_pc = pcoa(formatted_dm.split("\n")).split("\n")
        with open(expected_pc_fp, "U") as expected_pc_f:
            expected_pc = list(expected_pc_f)

        # run Procrustes analysis with monte carlo simulation
        actual_m_squared, trial_m_squareds, count_better, mc_p_value = \
            procrustes_monte_carlo(expected_pc,
                                   actual_pc,
                                   trials=random_trials,
                                   max_dimensions=num_dimensions,
                                   sample_id_map=None,
                                   trial_output_dir=None)

        yield (dataset_id, reference_id, method_id, params,
               actual_m_squared, mc_p_value)
def test_single_file_nj(self):
    """ single_file_nj should throw no errors"""
    titles = ["hi", "ho", "yo"]
    distdata = numpy.array([[0, 0.5, 0.3], [0.5, 0.0, 0.9], [0.3, 0.9, 0.0]])

    # input: a small formatted distance matrix in a temp file
    fname = get_tmp_filename(prefix="nj_", suffix=".txt")
    with open(fname, "w") as f:
        # the handle is now closed even if formatting or writing fails
        self._paths_to_clean_up.append(fname)
        f.write(format_distance_matrix(titles, distdata))

    # output: only the path is reserved; single_file_nj creates the file
    fname2 = get_tmp_filename(prefix="nj_", suffix=".txt", result_constructor=str)
    self._paths_to_clean_up.append(fname2)

    single_file_nj(fname, fname2)
    # assertTrue, unlike a bare assert, is not stripped under python -O
    self.assertTrue(os.path.exists(fname2))
def main():
    """Write element-wise mean, median and stdev matrices for a directory.

    Every file in ``opts.input_dir`` whose name does not start with '.' is
    parsed as a distance matrix. matrix_stats combines them, and the three
    resulting matrices are written to means.txt, medians.txt and stdevs.txt
    inside ``opts.output_dir``, which is created if it does not exist.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    indir = opts.input_dir
    outdir = opts.output_dir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # input: skip hidden files
    file_names = [fname for fname in os.listdir(indir)
                  if not fname.startswith('.')]
    distmats = []
    headers_list = []
    for fname in file_names:
        # `with` closes the handle even when parsing raises
        with open(os.path.join(indir, fname), 'U') as f:
            headers, data = parse_distmat(f)
        distmats.append(data)
        headers_list.append(headers)

    # calcs
    headers, means, medians, stdevs = matrix_stats(headers_list, distmats)

    # output: one file per statistic, all sharing the same headers
    outputs = (('means.txt', means),
               ('medians.txt', medians),
               ('stdevs.txt', stdevs))
    for out_name, matrix in outputs:
        with open(os.path.join(outdir, out_name), 'w') as f:
            f.write(format_distance_matrix(headers, matrix))
def main():
    """Write element-wise mean, median and stdev matrices for a directory.

    Every file in ``opts.input_dir`` whose name does not start with '.' is
    parsed as a distance matrix. matrix_stats combines them, and the three
    resulting matrices are written to means.txt, medians.txt and stdevs.txt
    inside ``opts.output_dir``, which is created if it does not exist.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    indir = opts.input_dir
    outdir = opts.output_dir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # input: skip hidden files
    file_names = [fname for fname in os.listdir(indir)
                  if not fname.startswith('.')]
    distmats = []
    headers_list = []
    for fname in file_names:
        # `with` closes the handle even when parsing raises
        with open(os.path.join(indir, fname), 'U') as f:
            headers, data = parse_distmat(f)
        distmats.append(data)
        headers_list.append(headers)

    # calcs
    headers, means, medians, stdevs = matrix_stats(headers_list, distmats)

    # output: one file per statistic, all sharing the same headers
    outputs = (('means.txt', means),
               ('medians.txt', medians),
               ('stdevs.txt', stdevs))
    for out_name, matrix in outputs:
        with open(os.path.join(outdir, out_name), 'w') as f:
            f.write(format_distance_matrix(headers, matrix))
def test_single_file_upgma(self):
    """ single_file_upgma should throw no errors"""
    titles = ['hi', 'ho']
    distdata = numpy.array([[0, .5], [.5, 0.]])

    # input: a small formatted distance matrix in a temp file
    fname = get_tmp_filename(prefix='upgma_', suffix='.txt')
    with open(fname, 'w') as f:
        # the handle is now closed even if formatting or writing fails
        self._paths_to_clean_up.append(fname)
        f.write(format_distance_matrix(titles, distdata))

    # output: only the path is reserved; single_file_upgma creates the file
    fname2 = get_tmp_filename(prefix='upgma_', suffix='.txt',
                              result_constructor=str)
    self._paths_to_clean_up.append(fname2)

    single_file_upgma(fname, fname2)
    # assertTrue, unlike a bare assert, is not stripped under python -O
    self.assertTrue(os.path.exists(fname2))
def test_single_file_nj(self):
    """ single_file_nj should throw no errors"""
    titles = ['hi', 'ho', 'yo']
    distdata = numpy.array([[0, .5, .3], [.5, 0., .9], [.3, .9, 0.]])

    # input: a small formatted distance matrix in a temp file
    fname = get_tmp_filename(prefix='nj_', suffix='.txt')
    with open(fname, 'w') as f:
        # the handle is now closed even if formatting or writing fails
        self._paths_to_clean_up.append(fname)
        f.write(format_distance_matrix(titles, distdata))

    # output: only the path is reserved; single_file_nj creates the file
    fname2 = get_tmp_filename(prefix='nj_', suffix='.txt',
                              result_constructor=str)
    self._paths_to_clean_up.append(fname2)

    single_file_nj(fname, fname2)
    # assertTrue, unlike a bare assert, is not stripped under python -O
    self.assertTrue(os.path.exists(fname2))
def test_single_file_upgma(self):
    """ single_file_upgma should throw no errors"""
    titles = ["hi", "ho"]
    distdata = numpy.array([[0, 0.5], [0.5, 0.0]])

    # input: a small formatted distance matrix in a temp file
    fd, fname = mkstemp(prefix="upgma_", suffix=".txt")
    # mkstemp has already created the file, so register it for cleanup
    # before anything else can fail
    self._paths_to_clean_up.append(fname)
    close(fd)
    with open(fname, "w") as f:
        f.write(format_distance_matrix(titles, distdata))

    # output: an empty temp file for single_file_upgma to write to
    fd, fname2 = mkstemp(prefix="upgma_", suffix=".txt")
    self._paths_to_clean_up.append(fname2)
    close(fd)

    single_file_upgma(fname, fname2)
    # assertTrue, unlike a bare assert, is not stripped under python -O
    self.assertTrue(os.path.exists(fname2))
def test_single_file_nj(self):
    """ single_file_nj should throw no errors"""
    titles = ['hi', 'ho', 'yo']
    distdata = numpy.array([[0, .5, .3], [.5, 0., .9], [.3, .9, 0.]])

    # input: a small formatted distance matrix in a temp file
    fd, fname = mkstemp(prefix='nj_', suffix='.txt')
    # mkstemp has already created the file, so register it for cleanup
    # before anything else can fail
    self._paths_to_clean_up.append(fname)
    close(fd)
    with open(fname, 'w') as f:
        f.write(format_distance_matrix(titles, distdata))

    # output: an empty temp file for single_file_nj to write to
    fd, fname2 = mkstemp(prefix='nj_', suffix='.txt')
    self._paths_to_clean_up.append(fname2)
    close(fd)

    single_file_nj(fname, fname2)
    # assertTrue, unlike a bare assert, is not stripped under python -O
    self.assertTrue(os.path.exists(fname2))
def test_single_file_upgma(self):
    """ single_file_upgma should throw no errors"""
    titles = ['hi', 'ho']
    distdata = numpy.array([[0, .5], [.5, 0.]])

    # input: a small formatted distance matrix in a temp file
    fd, fname = mkstemp(prefix='upgma_', suffix='.txt')
    # mkstemp has already created the file, so register it for cleanup
    # before anything else can fail
    self._paths_to_clean_up.append(fname)
    close(fd)
    with open(fname, 'w') as f:
        f.write(format_distance_matrix(titles, distdata))

    # output: an empty temp file for single_file_upgma to write to
    fd, fname2 = mkstemp(prefix='upgma_', suffix='.txt')
    self._paths_to_clean_up.append(fname2)
    close(fd)

    single_file_upgma(fname, fname2)
    # assertTrue, unlike a bare assert, is not stripped under python -O
    self.assertTrue(os.path.exists(fname2))
def filter_samples_from_distance_matrix(dm, samples_to_discard, negate=False):
    """ Drop (or, with negate, keep only) the listed samples in a dm

        dm: (sample_ids, dm_data) tuple, as returned from
         qiime.parse.parse_distmat; or a file handle that can be passed
         to qiime.parse.parse_distmat
        samples_to_discard: iterable of strings; only the first
         whitespace-separated token of each entry is used as a sample id
        negate: when true the listed samples are the ones kept

        Returns the filtered matrix as formatted by format_distance_matrix.
    """
    try:
        sample_ids, dm_data = dm
    except ValueError:
        # not a 2-tuple, so treat the input as something to parse
        sample_ids, dm_data = parse_distmat(dm)

    listed = set(entry.split()[0] for entry in samples_to_discard)
    keep_listed = bool(negate)

    def keep_sample(sample_id):
        # keep listed ids when negating, unlisted ids otherwise
        return (sample_id in listed) == keep_listed

    # first pass: drop unwanted rows, remembering which ids survive
    kept_pairs = [(row, sample_id)
                  for row, sample_id in zip(dm_data, sample_ids)
                  if keep_sample(sample_id)]
    new_sample_ids = [sample_id for _, sample_id in kept_pairs]
    rows_filtered = array([row for row, _ in kept_pairs]).transpose()

    # second pass: after the transpose the original columns are rows, so
    # the same test against the original id order drops unwanted columns
    kept_columns = [col for col, sample_id in zip(rows_filtered, sample_ids)
                    if keep_sample(sample_id)]
    new_dm_data = array(kept_columns).transpose()

    return format_distance_matrix(new_sample_ids, new_dm_data)
def single_object_beta(otu_table, metrics, tr, rowids=None,
                       full_tree=False):
    """variant of single_file_beta that takes and returns objects

    inputs:
     otu_table -- an otu table in the biom format
     metrics -- metrics (str, comma delimited if more than 1 metric)
     tr -- a phylonode cogent tree object, needed only by phylogenetic
           metrics
     rowids -- comma separated string of sample ids, or None
     full_tree -- when false, make_subtree=True is passed to the
                  phylogenetic metric functions

    returns:
     rowids is None: the formatted full distance matrix as a list of lines
     otherwise: the format_matrix output holding one row of
                dissimilarities per requested row id

    NOTE(review): both branches return while handling the first entry of
    metrics, so later metrics are never computed. Confirm against callers.
    """
    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray(list(otu_table.iterSampleData()))

    tree = tr if tr else None
    make_subtree = not full_tree
    # metrics that cannot be evaluated for one pair of samples in isolation
    whole_matrix_only = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                         'binary_dist_chisq')

    for metric in metrics.split(','):
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n" %
                        (metric, ))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n" %
                    (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, otu_table.ObservationIds, tree,
                                   otu_table.SampleIds,
                                   make_subtree=make_subtree)
            else:
                dissims = metric_f(otumtx)
            formatted = format_distance_matrix(otu_table.SampleIds, dissims)
            return formatted.split('\n')

        # only calc d(rowid1, *) for each rowid
        rowids_list = rowids.split(',')
        row_dissims = []  # same order as rowids_list
        for rowid in rowids_list:
            rowidx = otu_table.SampleIds.index(rowid)

            if metric_f.__name__ in whole_matrix_only:
                # compute the whole matrix and keep just this row
                warnings.warn(
                    'dissimilarity ' + metric_f.__name__ +
                    ' is not parallelized, calculating the whole matrix...')
                row_dissims.append(metric_f(otumtx)[rowidx])
                continue

            try:
                row_metric = get_phylogenetic_row_metric(metric)
            except AttributeError:
                # no row-wise implementation: do element by element
                dissims = []
                for i in range(len(otu_table.SampleIds)):
                    pair = otumtx[[rowidx, i], :]
                    if is_phylogenetic:
                        pair_ids = [otu_table.SampleIds[rowidx],
                                    otu_table.SampleIds[i]]
                        dissim = metric_f(pair, otu_table.ObservationIds,
                                          tree, pair_ids,
                                          make_subtree=make_subtree)[0, 1]
                    else:
                        dissim = metric_f(pair)[0, 1]
                    dissims.append(dissim)
            else:
                # do whole row at once
                dissims = row_metric(otumtx, otu_table.ObservationIds, tree,
                                     otu_table.SampleIds, rowid,
                                     make_subtree=make_subtree)
            row_dissims.append(dissims)

        return format_matrix(row_dissims, rowids_list, otu_table.SampleIds)
def single_file_beta(input_path, metrics, tree_path, output_dir,
                     rowids=None, full_tree=False):
    """ does beta diversity calc on a single otu table

    Writes one output file per metric, named
    <metric>_<input basename>.txt, into output_dir.

    inputs:
     input_path (str), path of a biom otu table
     metrics (str, comma delimited if more than 1 metric; or list)
     tree_path (str), path of a newick tree; may be empty when no
      phylogenetic metric is requested
     output_dir (str)
     rowids (comma separated str); when given, only the rows for these
      sample ids are computed and written with format_matrix
     full_tree (bool); when false, make_subtree=True is passed to the
      phylogenetic metric functions

    Writes a message to stderr and calls exit(1) when a metric is unknown
    or needs a tree and none was provided.
    """
    try:
        metrics_list = metrics.split(',')
    except AttributeError:
        # not a string: use it as an already-split list of metric names
        metrics_list = metrics

    # Both input handles used to be left open; close them after parsing.
    with open(input_path, 'U') as otu_table_f:
        otu_table = parse_biom_table(otu_table_f)
    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    tree = None
    if tree_path:
        with open(tree_path, 'U') as tree_f:
            tree = parse_newick(tree_f, PhyloNode)

    input_dir, input_filename = os.path.split(input_path)
    input_basename, input_ext = os.path.splitext(input_filename)
    make_subtree = not full_tree
    # metrics that cannot be evaluated for one pair of samples in isolation
    whole_matrix_only = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                         'binary_dist_chisq')

    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + input_basename + '.txt')
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n" %
                        (metric, ))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n" %
                    (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, otu_table.ObservationIds, tree,
                                   otu_table.SampleIds,
                                   make_subtree=make_subtree)
            else:
                dissims = metric_f(otumtx)
            formatted = format_distance_matrix(otu_table.SampleIds, dissims)
        else:
            # only calc d(rowid1, *) for each rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = otu_table.SampleIds.index(rowid)

                if metric_f.__name__ in whole_matrix_only:
                    # compute the whole matrix and keep just this row
                    warnings.warn(
                        'dissimilarity ' + metric_f.__name__ +
                        ' is not parallelized, calculating the whole matrix...')
                    row_dissims.append(metric_f(otumtx)[rowidx])
                    continue

                try:
                    row_metric = get_phylogenetic_row_metric(metric)
                except AttributeError:
                    # no row-wise implementation: do element by element
                    dissims = []
                    for i in range(len(otu_table.SampleIds)):
                        pair = otumtx[[rowidx, i], :]
                        if is_phylogenetic:
                            pair_ids = [otu_table.SampleIds[rowidx],
                                        otu_table.SampleIds[i]]
                            dissim = metric_f(pair, otu_table.ObservationIds,
                                              tree, pair_ids,
                                              make_subtree=make_subtree)[0, 1]
                        else:
                            dissim = metric_f(pair)[0, 1]
                        dissims.append(dissim)
                else:
                    # do whole row at once
                    dissims = row_metric(otumtx, otu_table.ObservationIds,
                                         tree, otu_table.SampleIds, rowid,
                                         make_subtree=make_subtree)
                row_dissims.append(dissims)

            formatted = format_matrix(row_dissims, rowids_list,
                                      otu_table.SampleIds)

        with open(outfilepath, 'w') as out_f:
            out_f.write(formatted)
def formatResult(self, result):
    """Return the formatted distance matrix for a (data, sample_names) pair."""
    matrix, names = result
    # format_distance_matrix takes the labels first, then the data
    return format_distance_matrix(names, matrix)
def single_object_beta(otu_table, metrics, tr, rowids=None,
                       full_tree=False):
    """variant of single_file_beta that takes and returns objects

    inputs:
     otu_table -- an otu table in the biom format
     metrics -- metrics (str, comma delimited if more than 1 metric)
     tr -- a phylonode cogent tree object, needed only by phylogenetic
           metrics
     rowids -- comma separated string of sample ids, or None
     full_tree -- when false, make_subtree=True is passed to the
                  phylogenetic metric functions

    returns:
     rowids is None: the formatted full distance matrix as a list of lines
     otherwise: the format_matrix output holding one row of
                dissimilarities per requested row id

    NOTE(review): both branches return while handling the first entry of
    metrics, so later metrics are never computed. Confirm against callers.
    """
    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    tree = tr if tr else None
    make_subtree = not full_tree
    # metrics that cannot be evaluated for one pair of samples in isolation
    whole_matrix_only = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                         'binary_dist_chisq')

    for metric in metrics.split(','):
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                # identity test: `== None` would defer to the tree
                # object's own equality method
                if tree is None:
                    stderr.write("metric %s requires a tree, but none found\n"
                                 % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write("Could not find metric %s.\n\nKnown metrics are: %s\n"
                             % (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, otu_table.ObservationIds, tree,
                                   otu_table.SampleIds,
                                   make_subtree=make_subtree)
            else:
                dissims = metric_f(otumtx)
            return format_distance_matrix(otu_table.SampleIds,
                                          dissims).split('\n')

        # only calc d(rowid1, *) for each rowid
        rowids_list = rowids.split(',')
        row_dissims = []  # same order as rowids_list
        for rowid in rowids_list:
            rowidx = otu_table.SampleIds.index(rowid)

            if metric_f.__name__ in whole_matrix_only:
                # compute the whole matrix and keep just this row
                warnings.warn('dissimilarity ' + metric_f.__name__ +
                              ' is not parallelized, calculating the whole matrix...')
                row_dissims.append(metric_f(otumtx)[rowidx])
                continue

            try:
                row_metric = get_phylogenetic_row_metric(metric)
            except AttributeError:
                # no row-wise implementation: do element by element
                dissims = []
                for i in range(len(otu_table.SampleIds)):
                    pair = otumtx[[rowidx, i], :]
                    if is_phylogenetic:
                        pair_ids = [otu_table.SampleIds[rowidx],
                                    otu_table.SampleIds[i]]
                        dissim = metric_f(pair, otu_table.ObservationIds,
                                          tree, pair_ids,
                                          make_subtree=make_subtree)[0, 1]
                    else:
                        dissim = metric_f(pair)[0, 1]
                    dissims.append(dissim)
            else:
                # do whole row at once
                dissims = row_metric(otumtx, otu_table.ObservationIds, tree,
                                     otu_table.SampleIds, rowid,
                                     make_subtree=make_subtree)
            row_dissims.append(dissims)

        return format_matrix(row_dissims, rowids_list, otu_table.SampleIds)
def single_file_beta(input_path, metrics, tree_path, output_dir,
                     rowids=None, full_tree=False):
    """ does beta diversity calc on a single otu table

    Writes one output file per metric, named
    <metric>_<input basename>.txt, into output_dir.

    inputs:
     input_path (str), path of a biom otu table
     metrics (str, comma delimited if more than 1 metric; or list)
     tree_path (str), path of a newick tree; may be empty when no
      phylogenetic metric is requested
     output_dir (str)
     rowids (comma separated str); when given, only the rows for these
      sample ids are computed and written with format_matrix
     full_tree (bool); when false, make_subtree=True is passed to the
      phylogenetic metric functions

    Writes a message to stderr and calls exit(1) when a metric is unknown
    or needs a tree and none was provided.
    """
    try:
        metrics_list = metrics.split(',')
    except AttributeError:
        # not a string: use it as an already-split list of metric names
        metrics_list = metrics

    # Both input handles used to be left open; close them after parsing.
    with open(input_path, 'U') as otu_table_f:
        otu_table = parse_biom_table(otu_table_f)
    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    tree = None
    if tree_path:
        with open(tree_path, 'U') as tree_f:
            tree = parse_newick(tree_f, PhyloNode)

    input_dir, input_filename = os.path.split(input_path)
    input_basename, input_ext = os.path.splitext(input_filename)
    make_subtree = not full_tree
    # metrics that cannot be evaluated for one pair of samples in isolation
    whole_matrix_only = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                         'binary_dist_chisq')

    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + input_basename + '.txt')
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                # identity test: `== None` would defer to the tree
                # object's own equality method
                if tree is None:
                    stderr.write("metric %s requires a tree, but none found\n"
                                 % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write("Could not find metric %s.\n\nKnown metrics are: %s\n"
                             % (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, otu_table.ObservationIds, tree,
                                   otu_table.SampleIds,
                                   make_subtree=make_subtree)
            else:
                dissims = metric_f(otumtx)
            formatted = format_distance_matrix(otu_table.SampleIds, dissims)
        else:
            # only calc d(rowid1, *) for each rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = otu_table.SampleIds.index(rowid)

                if metric_f.__name__ in whole_matrix_only:
                    # compute the whole matrix and keep just this row
                    warnings.warn('dissimilarity ' + metric_f.__name__ +
                                  ' is not parallelized, calculating the whole matrix...')
                    row_dissims.append(metric_f(otumtx)[rowidx])
                    continue

                try:
                    row_metric = get_phylogenetic_row_metric(metric)
                except AttributeError:
                    # no row-wise implementation: do element by element
                    dissims = []
                    for i in range(len(otu_table.SampleIds)):
                        pair = otumtx[[rowidx, i], :]
                        if is_phylogenetic:
                            pair_ids = [otu_table.SampleIds[rowidx],
                                        otu_table.SampleIds[i]]
                            dissim = metric_f(pair, otu_table.ObservationIds,
                                              tree, pair_ids,
                                              make_subtree=make_subtree)[0, 1]
                        else:
                            dissim = metric_f(pair)[0, 1]
                        dissims.append(dissim)
                else:
                    # do whole row at once
                    dissims = row_metric(otumtx, otu_table.ObservationIds,
                                         tree, otu_table.SampleIds, rowid,
                                         make_subtree=make_subtree)
                row_dissims.append(dissims)

            formatted = format_matrix(row_dissims, rowids_list,
                                      otu_table.SampleIds)

        with open(outfilepath, 'w') as out_f:
            out_f.write(formatted)
def test_format_distance_matrix_almost_zero_diagonal(self):
    # Near-zero values are rounded to 0.0 only on the diagonal. The same
    # tiny value off the diagonal, and a diagonal value that is not close
    # enough to zero (1e-05), must come through unchanged.
    tiny = 0.0000000000001
    matrix = array([[0.00001, 1, tiny],
                    [1.0, tiny, 3],
                    [tiny, 3.0, 0.0]])
    expected = ("\tfoo\tbar\tbaz\nfoo\t1e-05\t1.0\t1e-13\nbar\t1.0"
                "\t0.0\t3.0\nbaz\t1e-13\t3.0\t0.0")
    formatted = format_distance_matrix(["foo", "bar", "baz"], matrix)
    self.assertEqual(formatted, expected)
def shuffle_dm(dm_f):
    """Return the formatted distance matrix from dm_f with shuffled labels.

    dm_f: input accepted by parse_distmat

    Only the label list is shuffled (in place); the matrix data keeps its
    original order, so labels end up attached to different rows/columns.
    """
    parsed = parse_distmat(dm_f)
    sample_labels, distances = parsed
    shuffle(sample_labels)
    return format_distance_matrix(sample_labels, distances)
def single_file_beta(input_path, metrics, tree_path, output_dir,
                     rowids=None, full_tree=False):
    """ does beta diversity calc on a single otu table

    Writes one output file per metric, named <metric>_<input file name>,
    into output_dir.

    When a tree is given and full_tree is false, the tree is first reduced
    with getSubTree(otuids, ignore_missing=True) to the otus in the table.

    inputs:
     input_path (str), path of an otu table readable by parse_otu_table
     metrics (str, comma delimited if more than 1 metric)
     tree_path (str), path of a newick tree; may be empty when no
      phylogenetic metric is requested
     output_dir (str)
     rowids (comma separated str); when given, only the rows for these
      sample ids are computed and written with format_matrix
     full_tree (bool); when true the parsed tree is used unmodified

    Writes a message to stderr and calls exit(1) when a metric is unknown
    or needs a tree and none was provided.
    """
    with open(input_path, 'U') as f:
        # otu mtx is otus by samples
        samids, otuids, otumtx, lineages = parse_otu_table(f)

    tree = None
    if tree_path:
        with open(tree_path, 'U') as f:
            tree = parse_newick(f, PhyloNode)
        # NOTE(review): nesting inferred from collapsed source. The
        # reduction only makes sense when a tree was actually loaded.
        if not full_tree:
            tree = tree.getSubTree(otuids, ignore_missing=True)

    # metrics that cannot be evaluated for one pair of samples in isolation
    whole_matrix_only = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                         'binary_dist_chisq')

    for metric in metrics.split(','):
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + os.path.split(input_path)[1])
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                # identity test: `== None` would defer to the tree
                # object's own equality method
                if tree is None:
                    stderr.write("metric %s requires a tree, but none found\n"
                                 % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write("Could not find metric %s.\n\nKnown metrics are: %s\n"
                             % (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way; metrics expect samples by otus
            if is_phylogenetic:
                dissims = metric_f(otumtx.T, otuids, tree, samids)
            else:
                dissims = metric_f(otumtx.T)
            formatted = format_distance_matrix(samids, dissims)
        else:
            # only calc d(rowid1, *) for each rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = samids.index(rowid)

                if metric_f.__name__ in whole_matrix_only:
                    # compute the whole matrix and keep just this row
                    row_dissims.append(metric_f(otumtx.T)[rowidx])
                    continue

                try:
                    row_metric = get_phylogenetic_row_metric(metric)
                except AttributeError:
                    # no row-wise implementation: do element by element
                    dissims = []
                    for i in range(len(samids)):
                        pair = otumtx.T[[rowidx, i], :]
                        if is_phylogenetic:
                            dissim = metric_f(
                                pair, otuids, tree,
                                [samids[rowidx], samids[i]])[0, 1]
                        else:
                            dissim = metric_f(pair)[0, 1]
                        dissims.append(dissim)
                else:
                    # do whole row at once
                    dissims = row_metric(otumtx.T, otuids, tree, samids,
                                         rowid)
                row_dissims.append(dissims)

            formatted = format_matrix(row_dissims, rowids_list, samids)

        with open(outfilepath, 'w') as out_f:
            out_f.write(formatted)