def _get_points_to_estimate(self, reference_individual_count, start=1,
                            stop=None, num_steps=10):
    """Return a sorted list of depths/sizes to estimate.

    Parameters:
        reference_individual_count: depth that must appear in the result;
            inserted in sorted position if the step series misses it
        start: smallest depth (must be >= 1)
        stop: largest depth; if None it is derived from the table's
            per-sample count stats as max(2 * min_size, max_size)
        num_steps: number of steps between start and stop (must be >= 1)

    Raises:
        ValueError: if start or num_steps is < 1, or start > stop.
    """
    if stop is None:
        # Compute base sample size as stopping point.
        min_size, max_size, _, _, _ = compute_counts_per_sample_stats(
            self._biom_table)
        stop = int(max(2 * min_size, max_size))

    if start < 1 or num_steps < 1:
        raise ValueError("The minimum individual count and number of "
                         "steps must both be greater than or equal to 1.")
    if start > stop:
        raise ValueError("The minimum individual count must be less than "
                         "or equal to the maximum individual count.")

    # Guard against a zero step when (stop - start) < num_steps.
    step_size = max((stop - start) // num_steps, 1)

    # Bug fix: on Python 3, range() returns an immutable object, so
    # bisect.insort() would raise. Materialize the points as a list first
    # (on Python 2 range() already returned a list, so behavior is the same).
    points = list(range(start, stop + 1, step_size))
    if reference_individual_count not in points:
        insort(points, reference_individual_count)

    return points
def add_counts_to_mapping(biom_lines, mapping_lines, otu_counts, output_fp):
    """Count the number of seqs/OTUs per sample and add them to the mapping file.

    Inputs:
        biom_lines: lines (or file-like object) of the BIOM-format table
        mapping_lines: lines of the QIIME mapping file
        otu_counts: passed through as ``binary_counts``; when True the
            per-sample values are unique-observation counts rather than
            total sequence counts
        output_fp: path the corrected mapping file is written to

    A "NumIndividuals" column is inserted just before the last mapping
    column (so the final column keeps its position, per mapping-file
    convention — TODO confirm against write_corrected_mapping).
    """
    # Parse biom file
    biom = parse_biom_table(biom_lines)
    # Parse mapping file
    map_data, headers, comments = parse_mapping_file(mapping_lines)
    # Compute the counts per sample; only the per-sample dict is used here.
    _, _, _, _, counts_per_sample = \
        compute_counts_per_sample_stats(biom, binary_counts=otu_counts)
    # Add the counts to the mapping data, one column before the end.
    index = len(headers) - 1
    headers.insert(index, "NumIndividuals")
    for row in map_data:
        # row[0] is assumed to be the sample id keying counts_per_sample.
        row.insert(index, str(counts_per_sample[row[0]]))
    # Write the corrected mapping file (removed dead commented-out code that
    # duplicated header handling done inside write_corrected_mapping).
    write_corrected_mapping(output_fp, headers, comments, map_data)
def add_counts_to_mapping(biom_lines, mapping_lines, otu_counts, output_fp):
    """Count the seqs/OTUs per sample and append them to the mapping file.

    Inputs:
        biom_lines: BIOM table lines
        mapping_lines: QIIME mapping file lines
        otu_counts: forwarded as ``binary_counts`` to the stats helper
        output_fp: destination path for the corrected mapping file
    """
    table = parse_biom_table(biom_lines)
    map_data, headers, comments = parse_mapping_file(mapping_lines)

    # Per-sample counts; only the dictionary (last element) is needed below.
    sample_counts = compute_counts_per_sample_stats(
        table, binary_counts=otu_counts)[4]

    # Place the new column immediately before the final header column.
    insert_at = len(headers) - 1
    headers.insert(insert_at, "NumIndividuals")
    for record in map_data:
        record.insert(insert_at, str(sample_counts[record[0]]))

    # Re-attach the leading '#' and prepend the header row before writing.
    headers[0] = '#' + headers[0]
    map_data.insert(0, headers)

    write_corrected_file(map_data, comments, output_fp)
def test_compute_counts_per_sample_stats(self):
    """compute_counts_per_sample_stats functions as expected
    """
    stats = compute_counts_per_sample_stats(self.biom_otu_table1_w_tax)
    # Unpack for readability: (min, max, median, mean, per-sample counts).
    min_c, max_c, median_c, mean_c, per_sample = stats
    self.assertEqual(min_c, 3)
    self.assertEqual(max_c, 7)
    self.assertEqual(median_c, 4)
    self.assertEqual(mean_c, 4.5)
    self.assertEqual(per_sample, {'Sample1': 7, 'Sample2': 3, 'Sample3': 4,
                                  'Sample4': 6, 'Sample5': 3, 'Sample6': 4})
def test_compute_counts_per_sample_stats(self):
    """compute_counts_per_sample_stats functions as expected
    """
    observed = compute_counts_per_sample_stats(self.biom_otu_table1_w_tax)
    # Scalar stats come first: min, max, median, mean.
    for position, expected in enumerate((3, 7, 4, 4.5)):
        self.assertEqual(observed[position], expected)
    # Final element is the per-sample count mapping.
    self.assertEqual(
        observed[4],
        {"Sample1": 7, "Sample2": 3, "Sample3": 4,
         "Sample4": 6, "Sample5": 3, "Sample6": 4}
    )
def test_compute_counts_per_sample_stats_obs_counts(self):
    """compute_counts_per_sample_stats functions as expected
    """
    # binary_counts=True asks for unique-observation counts per sample.
    min_c, max_c, median_c, mean_c, per_sample = \
        compute_counts_per_sample_stats(self.biom_otu_table1_w_tax,
                                        binary_counts=True)
    self.assertEqual(min_c, 1)
    self.assertEqual(max_c, 4)
    self.assertEqual(median_c, 2.5)
    self.assertEqual(mean_c, 2.5)
    self.assertEqual(per_sample, {'Sample1': 2, 'Sample2': 3, 'Sample3': 4,
                                  'Sample4': 2, 'Sample5': 1, 'Sample6': 3})
def test_compute_counts_per_sample_stats_obs_counts(self):
    """compute_counts_per_sample_stats functions as expected
    """
    observed = compute_counts_per_sample_stats(self.biom_otu_table1_w_tax,
                                               binary_counts=True)
    # Scalar stats first: min, max, median, mean.
    for position, expected in enumerate((1, 4, 2.5, 2.5)):
        self.assertEqual(observed[position], expected)
    # Final element maps sample id -> unique-observation count.
    self.assertEqual(
        observed[4],
        {"Sample1": 2, "Sample2": 3, "Sample3": 4,
         "Sample4": 2, "Sample5": 1, "Sample6": 3}
    )
def test_compute_counts_per_sample_stats(self):
    """compute_counts_per_sample_stats functions as expected

    This method is ported from QIIME (http://www.qiime.org). QIIME is a GPL
    project, but we obtained permission from the authors of this method to
    port it to the BIOM Format project (and keep it under BIOM's BSD
    license).
    """
    min_c, max_c, median_c, mean_c, per_sample_counts = \
        compute_counts_per_sample_stats(self.biom_otu_table1_w_tax)
    self.assertEqual(min_c, 3)
    self.assertEqual(max_c, 7)
    self.assertEqual(median_c, 4)
    self.assertEqual(mean_c, 4.5)
    self.assertEqual(per_sample_counts,
                     {'Sample1': 7, 'Sample2': 3, 'Sample3': 4,
                      'Sample4': 6, 'Sample5': 3, 'Sample6': 4})
def test_compute_counts_per_sample_stats_obs_counts(self):
    """compute_counts_per_sample_stats functions as expected

    This method is ported from QIIME (http://www.qiime.org). QIIME is a GPL
    project, but we obtained permission from the authors of this method to
    port it to the BIOM Format project (and keep it under BIOM's BSD
    license).
    """
    min_c, max_c, median_c, mean_c, per_sample_counts = \
        compute_counts_per_sample_stats(self.biom_otu_table1_w_tax,
                                        binary_counts=True)
    self.assertEqual(min_c, 1)
    self.assertEqual(max_c, 4)
    self.assertEqual(median_c, 2.5)
    self.assertEqual(mean_c, 2.5)
    self.assertEqual(per_sample_counts,
                     {"Sample1": 2, "Sample2": 3, "Sample3": 4,
                      "Sample4": 2, "Sample5": 1, "Sample6": 3})
def run(self, **kwargs):
    """
    table: two-element tuple containing the biom table to summarize and
     the file(-like) object containing the original table data. The second
     element of the tuple (the file(-like) object) may be None. If this
     is the case, the MD5 sum will *not* be computed
    qualitative: counts are presented as number of unique observation
     ids per sample, rather than total observation count per sample
    suppress_md5: if ``True``, the MD5 sum of the table file contents
     will not be computed. This parameter is ignored if
     ``table[1] is None``
    """
    # Builds a plain-text summary report of the table; returned under the
    # 'biom-summary' key as a list of lines.
    result = {}
    qualitative = kwargs['qualitative']
    table, table_lines = kwargs['table']
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\
        compute_counts_per_sample_stats(table, qualitative)
    num_observations = len(table.ObservationIds)
    # MD5 is skipped either on request or when no file object was given.
    suppress_md5 = (table_lines is None) or kwargs['suppress_md5']
    counts_per_sample_values = counts_per_sample.values()
    # "None provided" placeholders when the table carries no metadata;
    # otherwise the key set of the first entry is shown (assumes all
    # entries share the same keys — TODO confirm).
    if table.SampleMetadata is None:
        sample_md_keys = ["None provided"]
    else:
        sample_md_keys = table.SampleMetadata[0].keys()
    if table.ObservationMetadata is None:
        observation_md_keys = ["None provided"]
    else:
        observation_md_keys = table.ObservationMetadata[0].keys()
    lines = []
    num_samples = len(counts_per_sample)
    lines.append('Num samples: %d' % num_samples)
    lines.append('Num observations: %d' % num_observations)
    if not qualitative:
        # Total count is only meaningful for quantitative summaries.
        total_count = sum(counts_per_sample_values)
        lines.append('Total count: %d' % total_count)
    lines.append('Table density (fraction of non-zero values): %1.3f' %
                 table.getTableDensity())
    if not suppress_md5:
        lines.append('Table md5 (unzipped): %s' % safe_md5(table_lines))
    lines.append('')
    if qualitative:
        lines.append('Observations/sample summary:')
    else:
        lines.append('Counts/sample summary:')
    lines.append(' Min: %r' % min_counts)
    lines.append(' Max: %r' % max_counts)
    lines.append(' Median: %1.3f' % median_counts)
    lines.append(' Mean: %1.3f' % mean_counts)
    # `std` is presumably numpy.std — TODO confirm the module's import.
    lines.append(' Std. dev.: %1.3f' % std(counts_per_sample_values))
    lines.append(' Sample Metadata Categories: %s' % '; '.join(sample_md_keys))
    lines.append(' Observation Metadata Categories: %s' % '; '.join(observation_md_keys))
    lines.append('')
    if qualitative:
        lines.append('Observations/sample detail:')
    else:
        lines.append('Counts/sample detail:')
    # Per-sample detail sorted by count (value first in the tuples).
    sorted_counts_per_sample = [(v, k) for k, v in counts_per_sample.items()]
    sorted_counts_per_sample.sort()
    for v, k in sorted_counts_per_sample:
        lines.append(' %s: %r' % (k, v))
    result['biom-summary'] = lines
    return result
def run(self, **kwargs):
    """Build a plain-text summary of the table from ``kwargs``.

    Expected kwargs (from the visible reads below):
        qualitative: report unique-observation counts instead of totals
        observations: summarize per observation (table is transposed)
        table: two-element tuple; only the first element (the table) is
            used here
    Returns a dict with the report lines under 'biom_summary'.
    """
    result = {}
    qualitative = kwargs['qualitative']
    by_observations = kwargs['observations']
    table, table_lines = kwargs['table']
    if by_observations:
        # Summarize per observation by transposing and reusing the
        # per-sample machinery.
        table = table.transpose()
    min_counts, max_counts, median_counts, mean_counts, counts_per_samp =\
        compute_counts_per_sample_stats(table, qualitative)
    num_observations = len(table.ids(axis='observation'))
    counts_per_sample_values = counts_per_samp.values()
    # Placeholder when no metadata; otherwise show the first entry's keys.
    if table.metadata() is None:
        sample_md_keys = ["None provided"]
    else:
        sample_md_keys = table.metadata()[0].keys()
    if table.metadata(axis='observation') is None:
        observation_md_keys = ["None provided"]
    else:
        observation_md_keys = table.metadata(axis='observation')[0].keys()
    lines = []
    num_samples = len(table.ids())
    if by_observations:
        # as this is a transpose of the original table...
        lines.append('Num samples: %d' % num_observations)
        lines.append('Num observations: %d' % num_samples)
    else:
        lines.append('Num samples: %d' % num_samples)
        lines.append('Num observations: %d' % num_observations)
    if not qualitative:
        # Totals only make sense for quantitative summaries.
        total_count = sum(counts_per_sample_values)
        lines.append('Total count: %d' % total_count)
    lines.append('Table density (fraction of non-zero values): %1.3f' %
                 table.get_table_density())
    lines.append('')
    if qualitative:
        if by_observations:
            lines.append('Sample/observations summary:')
        else:
            lines.append('Observations/sample summary:')
    else:
        lines.append('Counts/sample summary:')
    lines.append(' Min: %r' % min_counts)
    lines.append(' Max: %r' % max_counts)
    lines.append(' Median: %1.3f' % median_counts)
    lines.append(' Mean: %1.3f' % mean_counts)
    # `std` is presumably numpy.std — TODO confirm the module's import.
    lines.append(' Std. dev.: %1.3f' % std(counts_per_sample_values))
    if by_observations:
        # since this is a transpose...
        lines.append(' Sample Metadata Categories: %s' % '; '.join(observation_md_keys))
        lines.append(' Observation Metadata Categories: %s' % '; '.join(sample_md_keys))
        lines.append('')
    else:
        lines.append(' Sample Metadata Categories: %s' % '; '.join(sample_md_keys))
        lines.append(' Observation Metadata Categories: %s' % '; '.join(observation_md_keys))
        lines.append('')
    if qualitative:
        lines.append('Observations/sample detail:')
    else:
        lines.append('Counts/sample detail:')
    # Per-id detail, ascending by count (itemgetter(1) sorts on values).
    for k, v in sorted(counts_per_samp.items(), key=itemgetter(1)):
        lines.append(' %s: %r' % (k, v))
    result['biom_summary'] = lines
    return result
def _summarize_table(table, qualitative=False, observations=False):
    """Return a locale-aware, human-readable summary of a BIOM table.

    qualitative: report unique-observation counts instead of totals.
    observations: summarize per observation (table is transposed first).
    Returns the report as a single newline-joined string.

    NOTE(review): this mutates global locale state via setlocale; also
    locale.format() is deprecated in favor of locale.format_string() on
    modern Pythons — confirm the supported Python versions before changing.
    """
    lines = []
    locale.setlocale(locale.LC_ALL, '')
    if observations:
        # Reuse the per-sample machinery on the transposed table.
        table = table.transpose()
    min_counts, max_counts, median_counts, mean_counts, counts_per_samp =\
        compute_counts_per_sample_stats(table, qualitative)
    num_observations = len(table.ids(axis='observation'))
    counts_per_sample_values = list(counts_per_samp.values())
    # Placeholder when no metadata; otherwise show the first entry's keys.
    if table.metadata() is None:
        sample_md_keys = ["None provided"]
    else:
        sample_md_keys = table.metadata()[0].keys()
    if table.metadata(axis='observation') is None:
        observation_md_keys = ["None provided"]
    else:
        observation_md_keys = table.metadata(axis='observation')[0].keys()
    num_samples = len(table.ids())
    if observations:
        # as this is a transpose of the original table...
        lines.append('Num samples: ' + locale.format('%d', num_observations,
                                                     grouping=True))
        lines.append('Num observations: ' + locale.format('%d', num_samples,
                                                          grouping=True))
    else:
        lines.append('Num samples: ' + locale.format('%d', num_samples,
                                                     grouping=True))
        lines.append('Num observations: ' + locale.format('%d',
                                                          num_observations,
                                                          grouping=True))
    if not qualitative:
        # Totals only make sense for quantitative summaries.
        total_count = sum(counts_per_sample_values)
        lines.append('Total count: ' + locale.format('%d', total_count,
                                                     grouping=True))
    lines.append('Table density (fraction of non-zero values): %1.3f' %
                 table.get_table_density())
    lines.append('')
    if qualitative:
        if observations:
            lines.append('Sample/observations summary:')
        else:
            lines.append('Observations/sample summary:')
    else:
        lines.append('Counts/sample summary:')
    lines.append(' Min: ' + locale.format('%1.3f', min_counts,
                                          grouping=True))
    lines.append(' Max: ' + locale.format('%1.3f', max_counts,
                                          grouping=True))
    lines.append(' Median: ' + locale.format('%1.3f', median_counts,
                                             grouping=True))
    lines.append(' Mean: ' + locale.format('%1.3f', mean_counts,
                                           grouping=True))
    # `std` is presumably numpy.std — TODO confirm the module's import.
    lines.append(' Std. dev.: ' + locale.format('%1.3f',
                                                std(counts_per_sample_values),
                                                grouping=True))
    if observations:
        # since this is a transpose...
        lines.append(
            ' Sample Metadata Categories: %s' % '; '.join(observation_md_keys))
        lines.append(
            ' Observation Metadata Categories: %s' % '; '.join(sample_md_keys))
        lines.append('')
    else:
        lines.append(
            ' Sample Metadata Categories: %s' % '; '.join(sample_md_keys))
        lines.append(
            ' Observation Metadata Categories: %s' %
            '; '.join(observation_md_keys))
        lines.append('')
    if qualitative:
        lines.append('Observations/sample detail:')
    else:
        lines.append('Counts/sample detail:')
    # Per-id detail, ascending by count (itemgetter(1) sorts on values).
    for k, v in sorted(counts_per_samp.items(), key=itemgetter(1)):
        lines.append('%s: ' % k + locale.format('%1.3f', v, grouping=True))
    return "\n".join(lines)
def run(self, **kwargs):
    """Assemble a plain-text summary report of the table.

    Reads from kwargs: 'qualitative', 'observations' (transpose and
    summarize per observation), and 'table' (a two-element tuple; only
    the table itself is used here). Returns a dict whose 'biom_summary'
    entry is the list of report lines.
    """
    result = {}
    qualitative = kwargs['qualitative']
    by_observations = kwargs['observations']
    table, table_lines = kwargs['table']
    if by_observations:
        # Per-observation summary: transpose and reuse the per-sample path.
        table = table.transpose()
    min_counts, max_counts, median_counts, mean_counts, counts_per_samp =\
        compute_counts_per_sample_stats(table, qualitative)
    num_observations = len(table.ids(axis='observation'))
    counts_per_sample_values = counts_per_samp.values()
    # Show "None provided" when the table has no metadata; otherwise the
    # first entry's key set (assumes uniform keys — TODO confirm).
    if table.metadata() is None:
        sample_md_keys = ["None provided"]
    else:
        sample_md_keys = table.metadata()[0].keys()
    if table.metadata(axis='observation') is None:
        observation_md_keys = ["None provided"]
    else:
        observation_md_keys = table.metadata(axis='observation')[0].keys()
    lines = []
    num_samples = len(table.ids())
    if by_observations:
        # as this is a transpose of the original table...
        lines.append('Num samples: %d' % num_observations)
        lines.append('Num observations: %d' % num_samples)
    else:
        lines.append('Num samples: %d' % num_samples)
        lines.append('Num observations: %d' % num_observations)
    if not qualitative:
        total_count = sum(counts_per_sample_values)
        lines.append('Total count: %d' % total_count)
    lines.append('Table density (fraction of non-zero values): %1.3f' %
                 table.get_table_density())
    lines.append('')
    if qualitative:
        if by_observations:
            lines.append('Sample/observations summary:')
        else:
            lines.append('Observations/sample summary:')
    else:
        lines.append('Counts/sample summary:')
    lines.append(' Min: %r' % min_counts)
    lines.append(' Max: %r' % max_counts)
    lines.append(' Median: %1.3f' % median_counts)
    lines.append(' Mean: %1.3f' % mean_counts)
    # `std` is presumably numpy.std — TODO confirm the module's import.
    lines.append(' Std. dev.: %1.3f' % std(counts_per_sample_values))
    if by_observations:
        # since this is a transpose...
        lines.append(
            ' Sample Metadata Categories: %s' % '; '.join(observation_md_keys))
        lines.append(
            ' Observation Metadata Categories: %s' % '; '.join(sample_md_keys))
        lines.append('')
    else:
        lines.append(
            ' Sample Metadata Categories: %s' % '; '.join(sample_md_keys))
        lines.append(
            ' Observation Metadata Categories: %s' %
            '; '.join(observation_md_keys))
        lines.append('')
    if qualitative:
        lines.append('Observations/sample detail:')
    else:
        lines.append('Counts/sample detail:')
    # Detail lines ascend by count (itemgetter(1) sorts on the values).
    for k, v in sorted(counts_per_samp.items(), key=itemgetter(1)):
        lines.append(' %s: %r' % (k, v))
    result['biom_summary'] = lines
    return result
def test_compute_counts_per_sample_stats_empty(self):
    """An empty table yields all-zero stats and an empty per-sample dict."""
    empty_table = Table({}, [], [])
    self.assertEqual(compute_counts_per_sample_stats(empty_table),
                     (0, 0, 0, 0, {}))
def run(self, **kwargs):
    """Assemble a plain-text summary report of the table.

    Reads from kwargs: 'qualitative', 'suppress_md5', and 'table' (a
    two-element tuple of the table and the original file-like object;
    the MD5 is skipped when the latter is None). Returns a dict whose
    'biom_summary' entry is the list of report lines.
    """
    result = {}
    qualitative = kwargs['qualitative']
    table, table_lines = kwargs['table']
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\
        compute_counts_per_sample_stats(table, qualitative)
    num_observations = len(table.observation_ids)
    # MD5 is skipped either on request or when no file object was given.
    suppress_md5 = (table_lines is None) or kwargs['suppress_md5']
    counts_per_sample_values = counts_per_sample.values()
    # "None provided" placeholders when the table carries no metadata.
    if table.sample_metadata is None:
        sample_md_keys = ["None provided"]
    else:
        sample_md_keys = table.sample_metadata[0].keys()
    if table.observation_metadata is None:
        observation_md_keys = ["None provided"]
    else:
        observation_md_keys = table.observation_metadata[0].keys()
    lines = []
    num_samples = len(table.sample_ids)
    lines.append('Num samples: %d' % num_samples)
    lines.append('Num observations: %d' % num_observations)
    if not qualitative:
        # Totals only make sense for quantitative summaries.
        total_count = sum(counts_per_sample_values)
        lines.append('Total count: %d' % total_count)
    lines.append('Table density (fraction of non-zero values): %1.3f' %
                 table.get_table_density())
    if not suppress_md5:
        lines.append('Table md5 (unzipped): %s' % safe_md5(table_lines))
    lines.append('')
    if qualitative:
        lines.append('Observations/sample summary:')
    else:
        lines.append('Counts/sample summary:')
    lines.append(' Min: %r' % min_counts)
    lines.append(' Max: %r' % max_counts)
    lines.append(' Median: %1.3f' % median_counts)
    lines.append(' Mean: %1.3f' % mean_counts)
    # `std` is presumably numpy.std — TODO confirm the module's import.
    lines.append(' Std. dev.: %1.3f' % std(counts_per_sample_values))
    lines.append(
        ' Sample Metadata Categories: %s' % '; '.join(sample_md_keys))
    lines.append(
        ' Observation Metadata Categories: %s' %
        '; '.join(observation_md_keys))
    lines.append('')
    if qualitative:
        lines.append('Observations/sample detail:')
    else:
        lines.append('Counts/sample detail:')
    # Detail lines ascend by count (itemgetter(1) sorts on the values).
    for k, v in sorted(counts_per_sample.items(), key=itemgetter(1)):
        lines.append(' %s: %r' % (k, v))
    result['biom_summary'] = lines
    return result
def run_alpha_rarefaction(otu_table_fp, mapping_fp, output_dir,
                          command_handler, params, qiime_config,
                          tree_fp=None, num_steps=10, parallel=False,
                          logger=None, min_rare_depth=10,
                          max_rare_depth=None, suppress_md5=False,
                          status_update_callback=print_to_stdout,
                          plot_stderr_and_stddev=False,
                          retain_intermediate_files=True):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Generate rarefied OTU tables;
          2) Compute alpha diversity metrics for each rarefied OTU table;
          3) Collate alpha diversity results;
          4) Generate alpha rarefaction plots.

        The actual work is delegated to QIIME command-line scripts; this
        function only builds the command strings and hands them to
        ``command_handler``.
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        # Own the logger only if we created it, so the caller's logger
        # is never closed from here.
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])
    if max_rare_depth is None:
        # Default the maximum depth to the median per-sample count.
        min_count, max_count, median_count, mean_count, counts_per_sample =\
            compute_counts_per_sample_stats(
                load_table(otu_table_fp))
        max_rare_depth = median_count
    # `or 1` guards against a zero step when the depth range < num_steps.
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    max_rare_depth = int(max_rare_depth)
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions'])
    except KeyError:
        params_str = ''
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the rarefaction command
        rarefaction_cmd = \
            'parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
            (otu_table_fp, min_rare_depth, max_rare_depth, step,
             rarefaction_dir, params_str)
    else:
        # Build the rarefaction command
        rarefaction_cmd = \
            'multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
            (otu_table_fp, min_rare_depth, max_rare_depth, step,
             rarefaction_dir, params_str)
    commands.append([('Alpha rarefaction', rarefaction_cmd)])
    # Prep the alpha diversity command
    alpha_diversity_dir = '%s/alpha_div/' % output_dir
    create_dir(alpha_diversity_dir)
    try:
        params_str = get_params_str(params['alpha_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str += ' -t %s' % tree_fp
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "parallel_alpha_diversity.py -T -i %s -o %s %s" %\
            (rarefaction_dir, alpha_diversity_dir, params_str)
    else:
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "alpha_diversity.py -i %s -o %s %s" %\
            (rarefaction_dir, alpha_diversity_dir, params_str)
    commands.append([('Alpha diversity on rarefied OTU tables',
                      alpha_diversity_cmd)])
    # Prep the alpha diversity collation command
    alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
    create_dir(alpha_collated_dir)
    try:
        params_str = get_params_str(params['collate_alpha'])
    except KeyError:
        params_str = ''
    # Build the alpha diversity collation command
    alpha_collated_cmd = 'collate_alpha.py -i %s -o %s %s' %\
        (alpha_diversity_dir, alpha_collated_dir, params_str)
    commands.append([('Collate alpha', alpha_collated_cmd)])
    if not retain_intermediate_files:
        commands.append([
            ('Removing intermediate files',
             'rm -r %s %s' % (rarefaction_dir, alpha_diversity_dir))
        ])
    else:
        commands.append([('Skipping removal of intermediate files.', '')])
    # Prep the make rarefaction plot command(s)
    try:
        params_str = get_params_str(params['make_rarefaction_plots'])
    except KeyError:
        params_str = ''
    # A user-supplied std_type overrides the stddev+stderr dual-plot path.
    if 'std_type' in params[
            'make_rarefaction_plots'] or not plot_stderr_and_stddev:
        rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
        create_dir(rarefaction_plot_dir)
        # Build the make rarefaction plot command(s)
        # for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir,
             params_str)
        commands.append([('Rarefaction plot: %s' % 'All metrics',
                          make_rarefaction_plot_cmd)])
    else:
        # Generate both stddev and stderr plots in separate directories.
        rarefaction_plot_dir_stddev = '%s/alpha_rarefaction_plots_stddev/' % output_dir
        rarefaction_plot_dir_stderr = '%s/alpha_rarefaction_plots_stderr/' % output_dir
        create_dir(rarefaction_plot_dir_stddev)
        create_dir(rarefaction_plot_dir_stderr)
        # Build the make rarefaction plot command(s)
        # for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stddev,
             params_str)
        commands.append([('Rarefaction plot: %s' % 'All metrics',
                          make_rarefaction_plot_cmd)])
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stderr,
             params_str)
        commands.append([('Rarefaction plot: %s' % 'All metrics',
                          make_rarefaction_plot_cmd)])
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_alpha_rarefaction(otu_table_fp, mapping_fp, output_dir,
                          command_handler, params, qiime_config,
                          tree_fp=None, num_steps=10, parallel=False,
                          logger=None, min_rare_depth=10,
                          max_rare_depth=None, suppress_md5=False,
                          status_update_callback=print_to_stdout,
                          plot_stderr_and_stddev=False,
                          retain_intermediate_files=True):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Generate rarefied OTU tables;
          2) Compute alpha diversity metrics for each rarefied OTU table;
          3) Collate alpha diversity results;
          4) Generate alpha rarefaction plots.

        The work is delegated to QIIME scripts invoked through the
        configured python interpreter; this function only assembles the
        command strings and hands them to ``command_handler``.
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    # NOTE(review): `== None` kept for byte-compat; `is None` is preferred.
    if logger == None:
        # Own the logger only if we created it here.
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])
    if max_rare_depth == None:
        # Default the maximum depth to the median per-sample count.
        min_count, max_count, median_count, mean_count, counts_per_sample =\
            compute_counts_per_sample_stats(parse_biom_table(open(otu_table_fp, 'U')))
        max_rare_depth = median_count
    # `or 1` guards against a zero step when the depth range < num_steps.
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    max_rare_depth = int(max_rare_depth)
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions'])
    except KeyError:
        params_str = ''
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the rarefaction command
        rarefaction_cmd = \
            '%s %s/parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
            (python_exe_fp, script_dir, otu_table_fp, min_rare_depth,
             max_rare_depth, step, rarefaction_dir, params_str)
    else:
        # Build the rarefaction command
        rarefaction_cmd = \
            '%s %s/multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
            (python_exe_fp, script_dir, otu_table_fp, min_rare_depth,
             max_rare_depth, step, rarefaction_dir, params_str)
    commands.append([('Alpha rarefaction', rarefaction_cmd)])
    # Prep the alpha diversity command
    alpha_diversity_dir = '%s/alpha_div/' % output_dir
    create_dir(alpha_diversity_dir)
    try:
        params_str = get_params_str(params['alpha_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str += ' -t %s' % tree_fp
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "%s %s/parallel_alpha_diversity.py -T -i %s -o %s %s" %\
            (python_exe_fp, script_dir, rarefaction_dir,
             alpha_diversity_dir, params_str)
    else:
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "%s %s/alpha_diversity.py -i %s -o %s %s" %\
            (python_exe_fp, script_dir, rarefaction_dir,
             alpha_diversity_dir, params_str)
    commands.append(
        [('Alpha diversity on rarefied OTU tables', alpha_diversity_cmd)])
    # Prep the alpha diversity collation command
    alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
    create_dir(alpha_collated_dir)
    try:
        params_str = get_params_str(params['collate_alpha'])
    except KeyError:
        params_str = ''
    # Build the alpha diversity collation command
    alpha_collated_cmd = '%s %s/collate_alpha.py -i %s -o %s %s' %\
        (python_exe_fp, script_dir, alpha_diversity_dir,
         alpha_collated_dir, params_str)
    commands.append([('Collate alpha', alpha_collated_cmd)])
    if not retain_intermediate_files:
        commands.append([('Removing intermediate files',
                          'rm -r %s %s' % (rarefaction_dir, alpha_diversity_dir))])
    else:
        commands.append([('Skipping removal of intermediate files.', '')])
    # Prep the make rarefaction plot command(s)
    try:
        params_str = get_params_str(params['make_rarefaction_plots'])
    except KeyError:
        params_str = ''
    # A user-supplied std_type overrides the stddev+stderr dual-plot path.
    if 'std_type' in params['make_rarefaction_plots'] or not plot_stderr_and_stddev:
        rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
        create_dir(rarefaction_plot_dir)
        # Build the make rarefaction plot command(s)
        # for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics', make_rarefaction_plot_cmd)])
    else:
        # Generate both stddev and stderr plots in separate directories.
        rarefaction_plot_dir_stddev = '%s/alpha_rarefaction_plots_stddev/' % output_dir
        rarefaction_plot_dir_stderr = '%s/alpha_rarefaction_plots_stderr/' % output_dir
        create_dir(rarefaction_plot_dir_stddev)
        create_dir(rarefaction_plot_dir_stderr)
        # Build the make rarefaction plot command(s)
        # for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir_stddev, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics', make_rarefaction_plot_cmd)])
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir_stderr, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics', make_rarefaction_plot_cmd)])
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def main():
    """Print or write a summary report for a BIOM table (Python 2 CLI).

    Reads the input/output paths and options from the module-level
    `parser`; writes the report to --output_fp when given, otherwise to
    stdout.
    """
    opts, args = parser.parse_args()
    if opts.input_fp is None:
        parser.print_help()
        parser.error('Must specify an input file!')
    input_fp = opts.input_fp
    output_fp = opts.output_fp
    table = parse_biom_table(biom_open(input_fp, 'U'))
    # opts.num_observations doubles as the "qualitative" flag here.
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\
        compute_counts_per_sample_stats(table, opts.num_observations)
    num_observations = len(table.ObservationIds)
    suppress_md5 = opts.suppress_md5
    counts_per_sample_values = counts_per_sample.values()
    # Missing metadata raises TypeError on subscripting None — used here
    # as the "no metadata" signal.
    try:
        sample_md_keys = table.SampleMetadata[0].keys()
    except TypeError:
        sample_md_keys = ["None provided"]
    try:
        observation_md_keys = table.ObservationMetadata[0].keys()
    except TypeError:
        observation_md_keys = ["None provided"]
    lines = []
    num_samples = len(counts_per_sample)
    lines.append('Num samples: %s' % str(num_samples))
    lines.append('Num observations: %s' % str(num_observations))
    if not opts.num_observations:
        # Totals only make sense for quantitative summaries.
        total_count = sum(counts_per_sample_values)
        lines.append('Total count: %s' % str(total_count))
    lines.append('Table density (fraction of non-zero values): %1.4f' %
                 table.getTableDensity())
    if not suppress_md5:
        # NOTE(review): re-opens the input file for hashing; leaves both
        # handles to be closed by GC (pre-`with` era code).
        lines.append('Table md5 (unzipped): %s' % safe_md5(biom_open(input_fp, 'U')))
    lines.append('')
    if opts.num_observations:
        lines.append('Observations/sample summary:')
    else:
        lines.append('Counts/sample summary:')
    lines.append(' Min: %s' % str(min_counts))
    lines.append(' Max: %s' % str(max_counts))
    lines.append(' Median: %s' % str(median_counts))
    lines.append(' Mean: %s' % str(mean_counts))
    # `std` is presumably numpy.std — TODO confirm the module's import.
    lines.append(' Std. dev.: %s' % (str(std(counts_per_sample_values))))
    lines.append(' Sample Metadata Categories: %s' % '; '.join(sample_md_keys))
    lines.append(' Observation Metadata Categories: %s' % '; '.join(observation_md_keys))
    lines.append('')
    if opts.num_observations:
        lines.append('Observations/sample detail:')
    else:
        lines.append('Counts/sample detail:')
    # Per-sample detail sorted by count (value first in the tuples).
    sorted_counts_per_sample = [(v, k) for k, v in counts_per_sample.items()]
    sorted_counts_per_sample.sort()
    for v, k in sorted_counts_per_sample:
        lines.append(' %s: %s' % (k, str(v)))
    if output_fp != None:
        open(output_fp, 'w').write('\n'.join(lines))
    else:
        print '\n'.join(lines)