Ejemplo n.º 1
0
    def _get_points_to_estimate(self,
                                reference_individual_count,
                                start=1,
                                stop=None,
                                num_steps=10):
        """Returns depths/sizes to estimate."""
        if stop is None:
            # Compute base sample size as stopping point.
            min_size, max_size, _, _, _ = compute_counts_per_sample_stats(
                self._biom_table)
            stop = int(max(2 * min_size, max_size))

        if start < 1 or num_steps < 1:
            raise ValueError("The minimum individual count and number of "
                             "steps must both be greater than or equal to 1.")

        if start > stop:
            raise ValueError("The minimum individual count must be less than "
                             "or equal to the maximum individual count.")

        step_size = max((stop - start) // num_steps, 1)

        points = range(start, stop + 1, step_size)
        if reference_individual_count not in points:
            insort(points, reference_individual_count)

        return points
Ejemplo n.º 2
0
def add_counts_to_mapping(biom_lines, mapping_lines, otu_counts, output_fp):
    """Add a per-sample count column to a mapping file and write it out.

    A "NumIndividuals" column is inserted just before the last existing
    column of the mapping file.

    Inputs:
        biom_lines: input handed to parse_biom_table
        mapping_lines: input handed to parse_mapping_file
        otu_counts: forwarded as the binary_counts argument of
            compute_counts_per_sample_stats
        output_fp: destination handed to write_corrected_mapping
    """
    table = parse_biom_table(biom_lines)
    map_data, headers, comments = parse_mapping_file(mapping_lines)

    # Only the per-sample counts (fifth returned value) are needed here.
    _, _, _, _, per_sample = compute_counts_per_sample_stats(
        table, binary_counts=otu_counts)

    # The new column goes immediately before the final column.
    insert_at = len(headers) - 1
    headers.insert(insert_at, "NumIndividuals")
    for record in map_data:
        # The first field of each row is used as the lookup key.
        sample_key = record[0]
        record.insert(insert_at, str(per_sample[sample_key]))

    # Headers are passed separately, so they are not merged into map_data.
    write_corrected_mapping(output_fp, headers, comments, map_data)
Ejemplo n.º 3
0
def add_counts_to_mapping(biom_lines, mapping_lines, otu_counts, output_fp):
    """Add a per-sample count column to a mapping file and write it out.

    A "NumIndividuals" column is inserted just before the last existing
    column of the mapping file.

    Inputs:
        biom_lines: input handed to parse_biom_table
        mapping_lines: input handed to parse_mapping_file
        otu_counts: forwarded as the binary_counts argument of
            compute_counts_per_sample_stats
        output_fp: destination handed to write_corrected_file
    """
    table = parse_biom_table(biom_lines)
    map_data, headers, comments = parse_mapping_file(mapping_lines)

    # Only the per-sample counts (fifth returned value) are needed here.
    _, _, _, _, per_sample = compute_counts_per_sample_stats(
        table, binary_counts=otu_counts)

    # The new column goes immediately before the final column.
    insert_at = len(headers) - 1
    headers.insert(insert_at, "NumIndividuals")
    for record in map_data:
        # The first field of each row is used as the lookup key.
        sample_key = record[0]
        record.insert(insert_at, str(per_sample[sample_key]))

    # Prefix the header row with '#' and make it the first data row.
    headers[0] = '#' + headers[0]
    map_data.insert(0, headers)

    write_corrected_file(map_data, comments, output_fp)
Ejemplo n.º 4
0
 def test_compute_counts_per_sample_stats(self):
     """compute_counts_per_sample_stats returns the expected five values."""
     stats = compute_counts_per_sample_stats(self.biom_otu_table1_w_tax)

     # Positions 0-3 hold the scalar statistics, checked in order.
     for position, expected in enumerate((3, 7, 4, 4.5)):
         self.assertEqual(stats[position], expected)

     # Position 4 holds the per-sample counts keyed by sample id.
     expected_per_sample = {
         'Sample1': 7, 'Sample2': 3, 'Sample3': 4,
         'Sample4': 6, 'Sample5': 3, 'Sample6': 4,
     }
     self.assertEqual(stats[4], expected_per_sample)
Ejemplo n.º 5
0
 def test_compute_counts_per_sample_stats(self):
     """The default (non-binary) statistics match the known fixture values."""
     observed = compute_counts_per_sample_stats(self.biom_otu_table1_w_tax)

     expected_scalars = (3, 7, 4, 4.5)
     expected_counts = dict(
         Sample1=7, Sample2=3, Sample3=4, Sample4=6, Sample5=3, Sample6=4
     )

     # Scalars first (indices 0..3), then the per-sample mapping (index 4).
     for idx, wanted in zip(range(4), expected_scalars):
         self.assertEqual(observed[idx], wanted)
     self.assertEqual(observed[4], expected_counts)
Ejemplo n.º 6
0
 def test_compute_counts_per_sample_stats_obs_counts(self):
     """With binary_counts=True the expected five values are returned."""
     stats = compute_counts_per_sample_stats(
         self.biom_otu_table1_w_tax, binary_counts=True)

     # Positions 0-3 hold the scalar statistics, checked in order.
     for position, expected in enumerate((1, 4, 2.5, 2.5)):
         self.assertEqual(stats[position], expected)

     # Position 4 holds the per-sample counts keyed by sample id.
     expected_per_sample = {
         'Sample1': 2, 'Sample2': 3, 'Sample3': 4,
         'Sample4': 2, 'Sample5': 1, 'Sample6': 3,
     }
     self.assertEqual(stats[4], expected_per_sample)
Ejemplo n.º 7
0
 def test_compute_counts_per_sample_stats_obs_counts(self):
     """The binary_counts=True statistics match the known fixture values."""
     observed = compute_counts_per_sample_stats(
         self.biom_otu_table1_w_tax,
         binary_counts=True,
     )

     expected_scalars = (1, 4, 2.5, 2.5)
     expected_counts = dict(
         Sample1=2, Sample2=3, Sample3=4, Sample4=2, Sample5=1, Sample6=3
     )

     # Scalars first (indices 0..3), then the per-sample mapping (index 4).
     for idx, wanted in zip(range(4), expected_scalars):
         self.assertEqual(observed[idx], wanted)
     self.assertEqual(observed[4], expected_counts)
Ejemplo n.º 8
0
    def test_compute_counts_per_sample_stats(self):
        """compute_counts_per_sample_stats returns the expected five values.

        This method is ported from QIIME (http://www.qiime.org). QIIME is a GPL
        project, but we obtained permission from the authors of this method to
        port it to the BIOM Format project (and keep it under BIOM's BSD
        license).
        """
        stats = compute_counts_per_sample_stats(self.biom_otu_table1_w_tax)

        # Positions 0-3 hold the scalar statistics, checked in order.
        for position, expected in enumerate((3, 7, 4, 4.5)):
            self.assertEqual(stats[position], expected)

        # Position 4 holds the per-sample counts keyed by sample id.
        expected_per_sample = {
            'Sample1': 7, 'Sample2': 3, 'Sample3': 4,
            'Sample4': 6, 'Sample5': 3, 'Sample6': 4,
        }
        self.assertEqual(stats[4], expected_per_sample)
Ejemplo n.º 9
0
    def test_compute_counts_per_sample_stats(self):
        """The default (non-binary) statistics match the known fixture values.

        This method is ported from QIIME (http://www.qiime.org). QIIME is a GPL
        project, but we obtained permission from the authors of this method to
        port it to the BIOM Format project (and keep it under BIOM's BSD
        license).
        """
        observed = compute_counts_per_sample_stats(self.biom_otu_table1_w_tax)

        expected_scalars = (3, 7, 4, 4.5)
        expected_counts = dict(
            Sample1=7, Sample2=3, Sample3=4, Sample4=6, Sample5=3, Sample6=4
        )

        # Scalars first (indices 0..3), then the per-sample mapping (index 4).
        for idx, wanted in zip(range(4), expected_scalars):
            self.assertEqual(observed[idx], wanted)
        self.assertEqual(observed[4], expected_counts)
Ejemplo n.º 10
0
    def test_compute_counts_per_sample_stats_obs_counts(self):
        """With binary_counts=True the expected five values are returned.

        This method is ported from QIIME (http://www.qiime.org). QIIME is a GPL
        project, but we obtained permission from the authors of this method to
        port it to the BIOM Format project (and keep it under BIOM's BSD
        license).
        """
        stats = compute_counts_per_sample_stats(
            self.biom_otu_table1_w_tax,
            binary_counts=True,
        )

        # Positions 0-3 hold the scalar statistics, checked in order.
        for position, expected in enumerate((1, 4, 2.5, 2.5)):
            self.assertEqual(stats[position], expected)

        # Position 4 holds the per-sample counts keyed by sample id.
        expected_per_sample = {
            'Sample1': 2, 'Sample2': 3, 'Sample3': 4,
            'Sample4': 2, 'Sample5': 1, 'Sample6': 3,
        }
        self.assertEqual(stats[4], expected_per_sample)
Ejemplo n.º 11
0
    def _get_points_to_estimate(self, reference_individual_count, start=1,
                                stop=None, num_steps=10):
        """Returns depths/sizes to estimate."""
        if stop is None:
            # Compute base sample size as stopping point.
            min_size, max_size, _, _, _ = compute_counts_per_sample_stats(
                    self._biom_table)
            stop = int(max(2 * min_size, max_size))

        if start < 1 or num_steps < 1:
            raise ValueError("The minimum individual count and number of "
                             "steps must both be greater than or equal to 1.")

        if start > stop:
            raise ValueError("The minimum individual count must be less than "
                             "or equal to the maximum individual count.")

        step_size = max((stop - start) // num_steps, 1)

        points = range(start, stop + 1, step_size)
        if reference_individual_count not in points:
            insort(points, reference_individual_count)

        return points
Ejemplo n.º 12
0
    def run(self, **kwargs):
        """Build a plain-text summary of a BIOM table.

        Keyword arguments (all read from ``kwargs``):
         table: two-element tuple containing the biom table to summarize and
                the file(-like) object containing the original table data. The
                second element of the tuple (the file(-like) object) may be
                None. If this is the case, the MD5 sum will *not* be computed
         qualitative: counts are presented as number of unique observation
                      ids per sample, rather than total observation count per
                      sample
         suppress_md5: if ``True``, the MD5 sum of the table file contents will
                       not be computed. This parameter is ignored if
                       ``table[1] is None``

        Returns a dict whose 'biom-summary' entry is the list of summary
        lines (strings without trailing newlines).
        """
        result = {}
        qualitative = kwargs['qualitative']
        table, table_lines = kwargs['table']

        min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\
            compute_counts_per_sample_stats(table, qualitative)
        num_observations = len(table.ObservationIds)

        # Short-circuits: 'suppress_md5' is only looked up when table data
        # is actually available.
        suppress_md5 = (table_lines is None) or kwargs['suppress_md5']

        # Materialize the values: on Python 3 dict.values() is a view, which
        # array functions such as std() do not treat as a sequence.
        counts_per_sample_values = list(counts_per_sample.values())

        if table.SampleMetadata is None:
            sample_md_keys = ["None provided"]
        else:
            sample_md_keys = table.SampleMetadata[0].keys()

        if table.ObservationMetadata is None:
            observation_md_keys = ["None provided"]
        else:
            observation_md_keys = table.ObservationMetadata[0].keys()

        lines = []

        num_samples = len(counts_per_sample)
        lines.append('Num samples: %d' % num_samples)
        lines.append('Num observations: %d' % num_observations)
        if not qualitative:
            total_count = sum(counts_per_sample_values)
            lines.append('Total count: %d' % total_count)
            lines.append('Table density (fraction of non-zero values): %1.3f' %
                         table.getTableDensity())
        if not suppress_md5:
            lines.append('Table md5 (unzipped): %s' % safe_md5(table_lines))
        lines.append('')

        if qualitative:
            lines.append('Observations/sample summary:')
        else:
            lines.append('Counts/sample summary:')
        lines.append(' Min: %r' % min_counts)
        lines.append(' Max: %r' % max_counts)
        lines.append(' Median: %1.3f' % median_counts)
        lines.append(' Mean: %1.3f' % mean_counts)
        lines.append(' Std. dev.: %1.3f' % std(counts_per_sample_values))
        lines.append(' Sample Metadata Categories: %s' %
                     '; '.join(sample_md_keys))
        lines.append(' Observation Metadata Categories: %s' %
                     '; '.join(observation_md_keys))

        lines.append('')
        if qualitative:
            lines.append('Observations/sample detail:')
        else:
            lines.append('Counts/sample detail:')

        # Sort by count first, then by sample id to break ties.
        for v, k in sorted((v, k) for k, v in counts_per_sample.items()):
            lines.append(' %s: %r' % (k, v))

        result['biom-summary'] = lines
        return result
Ejemplo n.º 13
0
    def run(self, **kwargs):
        """Build a plain-text summary of a BIOM table.

        Keyword arguments (all read from ``kwargs``):
            qualitative: when true, the summary headings describe
                observations per sample instead of counts per sample; the
                flag is also forwarded to compute_counts_per_sample_stats.
            observations: when true, the table is transposed before
                summarizing and the sample/observation labels are swapped
                back in the output.
            table: two-element tuple ``(table, table_lines)``. Only the
                first element is used by this method.

        Returns a dict whose 'biom_summary' entry is the list of summary
        lines (strings without trailing newlines).
        """
        result = {}
        qualitative = kwargs['qualitative']
        by_observations = kwargs['observations']
        table, table_lines = kwargs['table']

        if by_observations:
            table = table.transpose()

        min_counts, max_counts, median_counts, mean_counts, counts_per_samp =\
            compute_counts_per_sample_stats(table, qualitative)
        num_observations = len(table.ids(axis='observation'))

        # Materialize the values: on Python 3 dict.values() is a view, which
        # array functions such as std() do not treat as a sequence.
        counts_per_sample_values = list(counts_per_samp.values())

        if table.metadata() is None:
            sample_md_keys = ["None provided"]
        else:
            sample_md_keys = table.metadata()[0].keys()

        if table.metadata(axis='observation') is None:
            observation_md_keys = ["None provided"]
        else:
            observation_md_keys = table.metadata(axis='observation')[0].keys()

        lines = []

        num_samples = len(table.ids())

        if by_observations:
            # as this is a transpose of the original table...
            lines.append('Num samples: %d' % num_observations)
            lines.append('Num observations: %d' % num_samples)
        else:
            lines.append('Num samples: %d' % num_samples)
            lines.append('Num observations: %d' % num_observations)

        if not qualitative:
            total_count = sum(counts_per_sample_values)
            lines.append('Total count: %d' % total_count)
            lines.append('Table density (fraction of non-zero values): %1.3f' %
                         table.get_table_density())

        lines.append('')

        if qualitative:
            if by_observations:
                lines.append('Sample/observations summary:')
            else:
                lines.append('Observations/sample summary:')
        else:
            lines.append('Counts/sample summary:')

        lines.append(' Min: %r' % min_counts)
        lines.append(' Max: %r' % max_counts)
        lines.append(' Median: %1.3f' % median_counts)
        lines.append(' Mean: %1.3f' % mean_counts)
        lines.append(' Std. dev.: %1.3f' % std(counts_per_sample_values))

        if by_observations:
            # since this is a transpose...
            lines.append(' Sample Metadata Categories: %s' %
                         '; '.join(observation_md_keys))
            lines.append(' Observation Metadata Categories: %s' %
                         '; '.join(sample_md_keys))
            lines.append('')
        else:
            lines.append(' Sample Metadata Categories: %s' %
                         '; '.join(sample_md_keys))
            lines.append(' Observation Metadata Categories: %s' %
                         '; '.join(observation_md_keys))
            lines.append('')

        if qualitative:
            lines.append('Observations/sample detail:')
        else:
            lines.append('Counts/sample detail:')

        # Detail rows are listed in ascending order of count.
        for k, v in sorted(counts_per_samp.items(), key=itemgetter(1)):
            lines.append(' %s: %r' % (k, v))

        result['biom_summary'] = lines
        return result
Ejemplo n.º 14
0
def _summarize_table(table, qualitative=False, observations=False):
    """Return a locale-formatted, newline-joined text summary of a table.

    Parameters:
        table: the table to summarize. It is transposed first when
            ``observations`` is true.
        qualitative: when true, the summary headings describe observations
            per sample instead of counts per sample; the flag is also
            forwarded to compute_counts_per_sample_stats.
        observations: when true, summarize over the transposed table and
            swap the sample/observation labels back in the output.

    Returns:
        A single string with the summary lines joined by newlines.

    Note:
        Calls ``locale.setlocale(locale.LC_ALL, '')``, which changes the
        process-wide locale to the user's default.
    """
    lines = []
    locale.setlocale(locale.LC_ALL, '')

    if observations:
        table = table.transpose()

    min_counts, max_counts, median_counts, mean_counts, counts_per_samp =\
        compute_counts_per_sample_stats(table, qualitative)
    num_observations = len(table.ids(axis='observation'))

    counts_per_sample_values = list(counts_per_samp.values())

    if table.metadata() is None:
        sample_md_keys = ["None provided"]
    else:
        sample_md_keys = table.metadata()[0].keys()

    if table.metadata(axis='observation') is None:
        observation_md_keys = ["None provided"]
    else:
        observation_md_keys = table.metadata(axis='observation')[0].keys()

    num_samples = len(table.ids())

    # locale.format() is deprecated and was removed in Python 3.12;
    # locale.format_string() accepts the same arguments used here.
    if observations:
        # as this is a transpose of the original table...
        lines.append('Num samples: ' +
                     locale.format_string('%d', num_observations,
                                          grouping=True))
        lines.append('Num observations: ' +
                     locale.format_string('%d', num_samples, grouping=True))
    else:
        lines.append('Num samples: ' +
                     locale.format_string('%d', num_samples, grouping=True))
        lines.append('Num observations: ' +
                     locale.format_string('%d', num_observations,
                                          grouping=True))

    if not qualitative:
        total_count = sum(counts_per_sample_values)
        lines.append('Total count: ' +
                     locale.format_string('%d', total_count, grouping=True))
        lines.append('Table density (fraction of non-zero values): %1.3f' %
                     table.get_table_density())

    lines.append('')

    if qualitative:
        if observations:
            lines.append('Sample/observations summary:')
        else:
            lines.append('Observations/sample summary:')
    else:
        lines.append('Counts/sample summary:')

    lines.append(' Min: ' +
                 locale.format_string('%1.3f', min_counts, grouping=True))
    lines.append(' Max: ' +
                 locale.format_string('%1.3f', max_counts, grouping=True))
    lines.append(' Median: ' +
                 locale.format_string('%1.3f', median_counts, grouping=True))
    lines.append(' Mean: ' +
                 locale.format_string('%1.3f', mean_counts, grouping=True))
    lines.append(' Std. dev.: ' +
                 locale.format_string('%1.3f', std(counts_per_sample_values),
                                      grouping=True))

    if observations:
        # since this is a transpose...
        lines.append(
            ' Sample Metadata Categories: %s' %
            '; '.join(observation_md_keys))
        lines.append(
            ' Observation Metadata Categories: %s' %
            '; '.join(sample_md_keys))
        lines.append('')
    else:
        lines.append(
            ' Sample Metadata Categories: %s' %
            '; '.join(sample_md_keys))
        lines.append(
            ' Observation Metadata Categories: %s' %
            '; '.join(observation_md_keys))
        lines.append('')

    if qualitative:
        lines.append('Observations/sample detail:')
    else:
        lines.append('Counts/sample detail:')

    # Detail rows are listed in ascending order of count.
    for k, v in sorted(counts_per_samp.items(), key=itemgetter(1)):
        lines.append('%s: ' % k +
                     locale.format_string('%1.3f', v, grouping=True))

    return "\n".join(lines)
Ejemplo n.º 15
0
    def run(self, **kwargs):
        """Build a plain-text summary of a BIOM table.

        Keyword arguments (all read from ``kwargs``):
            qualitative: when true, the summary headings describe
                observations per sample instead of counts per sample; the
                flag is also forwarded to compute_counts_per_sample_stats.
            observations: when true, the table is transposed before
                summarizing and the sample/observation labels are swapped
                back in the output.
            table: two-element tuple ``(table, table_lines)``. Only the
                first element is used by this method.

        Returns a dict whose 'biom_summary' entry is the list of summary
        lines (strings without trailing newlines).
        """
        result = {}
        qualitative = kwargs['qualitative']
        by_observations = kwargs['observations']
        table, table_lines = kwargs['table']

        if by_observations:
            table = table.transpose()

        min_counts, max_counts, median_counts, mean_counts, counts_per_samp =\
            compute_counts_per_sample_stats(table, qualitative)
        num_observations = len(table.ids(axis='observation'))

        # Materialize the values: on Python 3 dict.values() is a view, which
        # array functions such as std() do not treat as a sequence.
        counts_per_sample_values = list(counts_per_samp.values())

        if table.metadata() is None:
            sample_md_keys = ["None provided"]
        else:
            sample_md_keys = table.metadata()[0].keys()

        if table.metadata(axis='observation') is None:
            observation_md_keys = ["None provided"]
        else:
            observation_md_keys = table.metadata(axis='observation')[0].keys()

        lines = []

        num_samples = len(table.ids())

        if by_observations:
            # as this is a transpose of the original table...
            lines.append('Num samples: %d' % num_observations)
            lines.append('Num observations: %d' % num_samples)
        else:
            lines.append('Num samples: %d' % num_samples)
            lines.append('Num observations: %d' % num_observations)

        if not qualitative:
            total_count = sum(counts_per_sample_values)
            lines.append('Total count: %d' % total_count)
            lines.append('Table density (fraction of non-zero values): %1.3f' %
                         table.get_table_density())

        lines.append('')

        if qualitative:
            if by_observations:
                lines.append('Sample/observations summary:')
            else:
                lines.append('Observations/sample summary:')
        else:
            lines.append('Counts/sample summary:')

        lines.append(' Min: %r' % min_counts)
        lines.append(' Max: %r' % max_counts)
        lines.append(' Median: %1.3f' % median_counts)
        lines.append(' Mean: %1.3f' % mean_counts)
        lines.append(' Std. dev.: %1.3f' % std(counts_per_sample_values))

        if by_observations:
            # since this is a transpose...
            lines.append(
                ' Sample Metadata Categories: %s' %
                '; '.join(observation_md_keys))
            lines.append(
                ' Observation Metadata Categories: %s' %
                '; '.join(sample_md_keys))
            lines.append('')
        else:
            lines.append(
                ' Sample Metadata Categories: %s' %
                '; '.join(sample_md_keys))
            lines.append(
                ' Observation Metadata Categories: %s' %
                '; '.join(observation_md_keys))
            lines.append('')

        if qualitative:
            lines.append('Observations/sample detail:')
        else:
            lines.append('Counts/sample detail:')

        # Detail rows are listed in ascending order of count.
        for k, v in sorted(counts_per_samp.items(), key=itemgetter(1)):
            lines.append(' %s: %r' % (k, v))

        result['biom_summary'] = lines
        return result
Ejemplo n.º 16
0
 def test_compute_counts_per_sample_stats_empty(self):
     """An empty table yields zeroed statistics and no per-sample counts."""
     empty_table = Table({}, [], [])
     self.assertEqual(compute_counts_per_sample_stats(empty_table),
                      (0, 0, 0, 0, {}))
Ejemplo n.º 17
0
 def test_compute_counts_per_sample_stats_empty(self):
     """Statistics of a table with no data, samples or observations."""
     expected = (0, 0, 0, 0, {})
     observed = compute_counts_per_sample_stats(Table({}, [], []))
     self.assertEqual(observed, expected)
Ejemplo n.º 18
0
    def run(self, **kwargs):
        """Build a plain-text summary of a BIOM table.

        Keyword arguments (all read from ``kwargs``):
            qualitative: when true, the summary headings describe
                observations per sample instead of counts per sample; the
                flag is also forwarded to compute_counts_per_sample_stats.
            table: two-element tuple ``(table, table_lines)``. When
                ``table_lines`` is None the MD5 line is omitted.
            suppress_md5: when true, the MD5 line is omitted. Only looked up
                when ``table_lines`` is not None.

        Returns a dict whose 'biom_summary' entry is the list of summary
        lines (strings without trailing newlines).
        """
        result = {}
        qualitative = kwargs['qualitative']
        table, table_lines = kwargs['table']

        min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\
            compute_counts_per_sample_stats(table, qualitative)
        num_observations = len(table.observation_ids)

        # Short-circuits: 'suppress_md5' is only looked up when table data
        # is actually available.
        suppress_md5 = (table_lines is None) or kwargs['suppress_md5']

        # Materialize the values: on Python 3 dict.values() is a view, which
        # array functions such as std() do not treat as a sequence.
        counts_per_sample_values = list(counts_per_sample.values())

        if table.sample_metadata is None:
            sample_md_keys = ["None provided"]
        else:
            sample_md_keys = table.sample_metadata[0].keys()

        if table.observation_metadata is None:
            observation_md_keys = ["None provided"]
        else:
            observation_md_keys = table.observation_metadata[0].keys()

        lines = []

        num_samples = len(table.sample_ids)
        lines.append('Num samples: %d' % num_samples)
        lines.append('Num observations: %d' % num_observations)

        if not qualitative:
            total_count = sum(counts_per_sample_values)
            lines.append('Total count: %d' % total_count)
            lines.append('Table density (fraction of non-zero values): %1.3f' %
                         table.get_table_density())

        if not suppress_md5:
            lines.append('Table md5 (unzipped): %s' % safe_md5(table_lines))
        lines.append('')

        if qualitative:
            lines.append('Observations/sample summary:')
        else:
            lines.append('Counts/sample summary:')

        lines.append(' Min: %r' % min_counts)
        lines.append(' Max: %r' % max_counts)
        lines.append(' Median: %1.3f' % median_counts)
        lines.append(' Mean: %1.3f' % mean_counts)
        lines.append(' Std. dev.: %1.3f' % std(counts_per_sample_values))
        lines.append(
            ' Sample Metadata Categories: %s' %
            '; '.join(sample_md_keys))
        lines.append(
            ' Observation Metadata Categories: %s' %
            '; '.join(observation_md_keys))
        lines.append('')

        if qualitative:
            lines.append('Observations/sample detail:')
        else:
            lines.append('Counts/sample detail:')

        # Detail rows are listed in ascending order of count.
        for k, v in sorted(counts_per_sample.items(), key=itemgetter(1)):
            lines.append(' %s: %r' % (k, v))

        result['biom_summary'] = lines
        return result
Ejemplo n.º 19
0
def run_alpha_rarefaction(otu_table_fp,
                          mapping_fp,
                          output_dir,
                          command_handler,
                          params,
                          qiime_config,
                          tree_fp=None,
                          num_steps=10,
                          parallel=False,
                          logger=None,
                          min_rare_depth=10,
                          max_rare_depth=None,
                          suppress_md5=False,
                          status_update_callback=print_to_stdout,
                          plot_stderr_and_stddev=False,
                          retain_intermediate_files=True):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Generate rarefied OTU tables;
          2) Compute alpha diversity metrics for each rarefied OTU table;
          3) Collate alpha diversity results;
          4) Generate alpha rarefaction plots.

        The commands for these steps are assembled as strings and handed to
        ``command_handler`` in a single call; output sub-directories are
        created under ``output_dir`` while the commands are being built.

        Parameters:
          otu_table_fp: path of the OTU table to rarefy.
          mapping_fp: path of the mapping file passed to the plotting script.
          output_dir: directory that receives all outputs (created here).
          command_handler: callable invoked as
            ``command_handler(commands, status_update_callback, logger=...,
            close_logger_on_success=...)``.
          params: mapping of script name to script parameters. Missing
            sections are treated as empty, except ``'parallel'``, which is
            required when ``parallel`` is true.
          qiime_config: passed to the WorkflowLogger created when ``logger``
            is None.
          tree_fp: optional tree path, appended to the alpha diversity
            command as ``-t``.
          num_steps: number of steps between the minimum and maximum
            rarefaction depth; the step size is never smaller than 1.
          parallel: use the ``parallel_*`` variants of the scripts.
          logger: existing logger to use; when None a new one is created and
            closed on success.
          min_rare_depth: lowest rarefaction depth.
          max_rare_depth: highest rarefaction depth; when None the median
            count per sample of the OTU table is used.
          suppress_md5: skip logging the MD5 sums of the input files.
          status_update_callback: forwarded to ``command_handler``.
          plot_stderr_and_stddev: when true and ``'std_type'`` is not set in
            ``params['make_rarefaction_plots']``, generate two sets of plots
            (stddev and stderr) instead of one.
          retain_intermediate_files: when false, add a command removing the
            rarefaction and alpha diversity directories.

    """
    # Prepare some variables for the later steps
    # NOTE(review): the path components computed by split/splitext are not
    # used later in this function.
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    if max_rare_depth is None:
        min_count, max_count, median_count, mean_count, counts_per_sample =\
            compute_counts_per_sample_stats(
                load_table(otu_table_fp))
        max_rare_depth = median_count
    # A zero step would make no progress, so fall back to 1.
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    max_rare_depth = int(max_rare_depth)

    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions'])
    except KeyError:
        params_str = ''
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the rarefaction command
        rarefaction_cmd = \
            'parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
            (otu_table_fp, min_rare_depth, max_rare_depth, step,
             rarefaction_dir, params_str)
    else:
        # Build the rarefaction command
        rarefaction_cmd = \
            'multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
            (otu_table_fp, min_rare_depth, max_rare_depth, step,
             rarefaction_dir, params_str)
    commands.append([('Alpha rarefaction', rarefaction_cmd)])

    # Prep the alpha diversity command
    alpha_diversity_dir = '%s/alpha_div/' % output_dir
    create_dir(alpha_diversity_dir)
    try:
        params_str = get_params_str(params['alpha_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str += ' -t %s' % tree_fp
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "parallel_alpha_diversity.py -T -i %s -o %s %s" %\
            (rarefaction_dir, alpha_diversity_dir, params_str)
    else:
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "alpha_diversity.py -i %s -o %s %s" %\
            (rarefaction_dir, alpha_diversity_dir, params_str)

    commands.append([('Alpha diversity on rarefied OTU tables',
                      alpha_diversity_cmd)])

    # Prep the alpha diversity collation command
    alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
    create_dir(alpha_collated_dir)
    try:
        params_str = get_params_str(params['collate_alpha'])
    except KeyError:
        params_str = ''
    # Build the alpha diversity collation command
    alpha_collated_cmd = 'collate_alpha.py -i %s -o %s %s' %\
        (alpha_diversity_dir, alpha_collated_dir, params_str)
    commands.append([('Collate alpha', alpha_collated_cmd)])

    if not retain_intermediate_files:
        commands.append([
            ('Removing intermediate files',
             'rm -r %s %s' % (rarefaction_dir, alpha_diversity_dir))
        ])
    else:
        commands.append([('Skipping removal of intermediate files.', '')])

    # Prep the make rarefaction plot command(s)
    try:
        params_str = get_params_str(params['make_rarefaction_plots'])
    except KeyError:
        params_str = ''

    # Guard this lookup the same way as the one above: a missing
    # 'make_rarefaction_plots' section means no std_type was requested
    # rather than an error.
    try:
        std_type_given = 'std_type' in params['make_rarefaction_plots']
    except KeyError:
        std_type_given = False

    if std_type_given or not plot_stderr_and_stddev:
        rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
        create_dir(rarefaction_plot_dir)

        # Build the make rarefaction plot command(s)
        # for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir, params_str)
        commands.append([('Rarefaction plot: %s' % 'All metrics',
                          make_rarefaction_plot_cmd)])
    else:
        rarefaction_plot_dir_stddev = '%s/alpha_rarefaction_plots_stddev/' % output_dir
        rarefaction_plot_dir_stderr = '%s/alpha_rarefaction_plots_stderr/' % output_dir
        create_dir(rarefaction_plot_dir_stddev)
        create_dir(rarefaction_plot_dir_stderr)

        # Build the make rarefaction plot command(s)
        # for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stddev,
             params_str)
        commands.append([('Rarefaction plot: %s' % 'All metrics',
                          make_rarefaction_plot_cmd)])
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stderr,
             params_str)
        commands.append([('Rarefaction plot: %s' % 'All metrics',
                          make_rarefaction_plot_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
Ejemplo n.º 20
0
def run_alpha_rarefaction(otu_table_fp,
                          mapping_fp,
                          output_dir,
                          command_handler,
                          params,
                          qiime_config,
                          tree_fp=None,
                          num_steps=10,
                          parallel=False,
                          logger=None,
                          min_rare_depth=10,
                          max_rare_depth=None,
                          suppress_md5=False,
                          status_update_callback=print_to_stdout,
                          plot_stderr_and_stddev=False,
                          retain_intermediate_files=True):
    """Build and run the alpha rarefaction workflow commands.

        The steps performed by this function are:
          1) Generate rarefied OTU tables;
          2) Compute alpha diversity metrics for each rarefied OTU table;
          3) Collate alpha diversity results;
          4) Generate alpha rarefaction plots.

        The function only assembles the command lines (and creates the
        output directories); execution is delegated to command_handler.

        Parameters:
          otu_table_fp: path of the input OTU table. Passed as -i to the
            rarefaction script and, when max_rare_depth is None, opened
            and parsed as a BIOM table to derive the maximum depth.
          mapping_fp: mapping file path, passed as -m to
            make_rarefaction_plots.py.
          output_dir: directory that is created and under which the
            rarefaction/, alpha_div/, alpha_div_collated/ and plot
            sub-directories are created.
          command_handler: callable invoked as
            command_handler(commands, status_update_callback,
            logger=..., close_logger_on_success=...).
          params: mapping of script name -> options. The sections
            'multiple_rarefactions', 'alpha_diversity', 'collate_alpha'
            and 'make_rarefaction_plots' are optional; 'parallel' is read
            without a fallback when parallel is true.
          qiime_config: mapping that must provide 'python_exe_fp'.
          tree_fp: if truthy, appended as '-t <tree_fp>' to the alpha
            diversity command.
          num_steps: number of steps the depth range is divided into when
            computing the step size (minimum step is 1).
          parallel: if true, the parallel_* variants of the rarefaction
            and alpha diversity scripts are used.
          logger: existing logger to use; when None a WorkflowLogger is
            created and closed on success.
          min_rare_depth: lower rarefaction depth (-m).
          max_rare_depth: upper rarefaction depth (-x); when None the
            median count per sample of the OTU table is used.
          suppress_md5: if true, input md5 sums are not logged.
          status_update_callback: forwarded to command_handler.
          plot_stderr_and_stddev: if true, and 'std_type' is not set in
            the 'make_rarefaction_plots' section, two plot commands are
            issued (stddev and stderr) instead of one.
          retain_intermediate_files: if false, a command removing the
            rarefaction and alpha diversity directories is appended.

        Returns None.
    """
    # Prepare some variables for the later steps
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    if max_rare_depth is None:
        # Default the upper depth to the median count per sample. The
        # table is only needed for this statistic, so close the file as
        # soon as it has been parsed.
        with open(otu_table_fp, 'U') as otu_table_f:
            min_count, max_count, median_count, mean_count, \
                counts_per_sample = compute_counts_per_sample_stats(
                    parse_biom_table(otu_table_f))
        max_rare_depth = median_count
    # The step is computed before max_rare_depth is truncated to an int;
    # a zero step falls back to 1.
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    max_rare_depth = int(max_rare_depth)

    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions'])
    except KeyError:
        params_str = ''
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the rarefaction command
        rarefaction_cmd = \
            '%s %s/parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
            (python_exe_fp, script_dir, otu_table_fp, min_rare_depth,
             max_rare_depth, step, rarefaction_dir, params_str)
    else:
        # Build the rarefaction command
        rarefaction_cmd = \
            '%s %s/multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
            (python_exe_fp, script_dir, otu_table_fp, min_rare_depth,
             max_rare_depth, step, rarefaction_dir, params_str)
    commands.append([('Alpha rarefaction', rarefaction_cmd)])

    # Prep the alpha diversity command
    alpha_diversity_dir = '%s/alpha_div/' % output_dir
    create_dir(alpha_diversity_dir)
    try:
        params_str = get_params_str(params['alpha_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str += ' -t %s' % tree_fp
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "%s %s/parallel_alpha_diversity.py -T -i %s -o %s %s" %\
            (python_exe_fp, script_dir, rarefaction_dir,
             alpha_diversity_dir, params_str)
    else:
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "%s %s/alpha_diversity.py -i %s -o %s %s" %\
            (python_exe_fp, script_dir, rarefaction_dir,
             alpha_diversity_dir, params_str)

    commands.append(
        [('Alpha diversity on rarefied OTU tables', alpha_diversity_cmd)])

    # Prep the alpha diversity collation command
    alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
    create_dir(alpha_collated_dir)
    try:
        params_str = get_params_str(params['collate_alpha'])
    except KeyError:
        params_str = ''
    # Build the alpha diversity collation command
    alpha_collated_cmd = '%s %s/collate_alpha.py -i %s -o %s %s' %\
        (python_exe_fp, script_dir, alpha_diversity_dir,
         alpha_collated_dir, params_str)
    commands.append([('Collate alpha', alpha_collated_cmd)])

    if not retain_intermediate_files:
        commands.append([('Removing intermediate files',
                          'rm -r %s %s' % (rarefaction_dir,
                                           alpha_diversity_dir))])
    else:
        # An empty command is queued so the decision is visible to the
        # command handler / log.
        commands.append([('Skipping removal of intermediate files.', '')])

    # Prep the make rarefaction plot command(s)
    try:
        params_str = get_params_str(params['make_rarefaction_plots'])
    except KeyError:
        params_str = ''
    # The section is optional (see the fallback above), so a missing
    # section must be treated as "std_type not set" instead of raising.
    try:
        std_type_in_params = 'std_type' in params['make_rarefaction_plots']
    except KeyError:
        std_type_in_params = False

    if std_type_in_params or not plot_stderr_and_stddev:
        rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
        create_dir(rarefaction_plot_dir)

        # Build the make rarefaction plot command
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics',
              make_rarefaction_plot_cmd)])
    else:
        rarefaction_plot_dir_stddev = \
            '%s/alpha_rarefaction_plots_stddev/' % output_dir
        rarefaction_plot_dir_stderr = \
            '%s/alpha_rarefaction_plots_stderr/' % output_dir
        create_dir(rarefaction_plot_dir_stddev)
        create_dir(rarefaction_plot_dir_stderr)

        # Build the make rarefaction plot commands: one per error type,
        # each written to its own directory.
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir_stddev, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics',
              make_rarefaction_plot_cmd)])
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir_stderr, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics',
              make_rarefaction_plot_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def main():
    """Summarize a BIOM table and write the summary to a file or stdout.

    Reads the command-line options from the module-level ``parser``
    (input_fp, output_fp, num_observations, suppress_md5), parses the
    table at input_fp and reports: number of samples and observations,
    total count and table density (only when num_observations is not
    set), optionally an md5 of the input, per-sample summary statistics,
    the metadata categories, and the per-sample values sorted ascending.

    The summary is written to output_fp when one is given, otherwise it
    is printed to stdout.
    """
    opts, args = parser.parse_args()

    if opts.input_fp is None:
        parser.print_help()
        parser.error('Must specify an input file!')

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    # Close the input handle once the table has been parsed.
    # NOTE(review): assumes biom_open returns a file-like object with
    # close() -- confirm.
    biom_f = biom_open(input_fp, 'U')
    try:
        table = parse_biom_table(biom_f)
    finally:
        biom_f.close()

    min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\
        compute_counts_per_sample_stats(table, opts.num_observations)
    num_observations = len(table.ObservationIds)
    suppress_md5 = opts.suppress_md5

    # Materialize the values so they can be consumed more than once.
    counts_per_sample_values = list(counts_per_sample.values())

    # Indexing the metadata raises TypeError when none is attached.
    try:
        sample_md_keys = table.SampleMetadata[0].keys()
    except TypeError:
        sample_md_keys = ["None provided"]
    try:
        observation_md_keys = table.ObservationMetadata[0].keys()
    except TypeError:
        observation_md_keys = ["None provided"]

    lines = []

    num_samples = len(counts_per_sample)
    lines.append('Num samples: %s' % str(num_samples))
    lines.append('Num observations: %s' % str(num_observations))
    if not opts.num_observations:
        total_count = sum(counts_per_sample_values)
        lines.append('Total count: %s' % str(total_count))
        lines.append('Table density (fraction of non-zero values): %1.4f' %
                     table.getTableDensity())
    if not suppress_md5:
        # Re-open the input so the checksum is computed over the whole
        # file, and close it again afterwards.
        md5_f = biom_open(input_fp, 'U')
        try:
            lines.append('Table md5 (unzipped): %s' % safe_md5(md5_f))
        finally:
            md5_f.close()
    lines.append('')

    if opts.num_observations:
        lines.append('Observations/sample summary:')
    else:
        lines.append('Counts/sample summary:')
    lines.append(' Min: %s' % str(min_counts))
    lines.append(' Max: %s' % str(max_counts))
    lines.append(' Median: %s' % str(median_counts))
    lines.append(' Mean: %s' % str(mean_counts))
    lines.append(' Std. dev.: %s' % (str(std(counts_per_sample_values))))
    lines.append(' Sample Metadata Categories: %s' %
                 '; '.join(sample_md_keys))
    lines.append(' Observation Metadata Categories: %s' %
                 '; '.join(observation_md_keys))

    lines.append('')
    if opts.num_observations:
        lines.append('Observations/sample detail:')
    else:
        lines.append('Counts/sample detail:')
    # Sort by value first, then by sample id, so the detail section is
    # listed in ascending order of count.
    sorted_counts_per_sample = sorted(
        (v, k) for k, v in counts_per_sample.items())
    for v, k in sorted_counts_per_sample:
        lines.append(' %s: %s' % (k, str(v)))

    summary = '\n'.join(lines)
    if output_fp is not None:
        # Use a context manager so the output is flushed and closed
        # deterministically.
        with open(output_fp, 'w') as out_f:
            out_f.write(summary)
    else:
        # Single-argument call form prints identically on Python 2 and 3.
        print(summary)