Beispiel #1
0
    def _appraise_inexactly(self, metagenome_otu_table_collection,
                            found_otu_collection, sequence_identity):
        '''Appraise metagenome OTUs against OTUs 'found' either by binning or
        assembly, using inexact matching.

        Every result yielded by SequenceSearcher().global_search() is grouped
        by the sample name of its query OTU. A query is recorded as found
        when the result has a target.

        Parameters
        ----------
        metagenome_otu_table_collection:
            first argument handed to global_search()
        found_otu_collection:
            second argument handed to global_search()
        sequence_identity:
            third argument handed to global_search()

        Returns
        -------
        dict of sample name to AppraisalBuildingBlock
        '''
        # NOTE(review): this wrapped collection is built but never handed to
        # the search below, which receives found_otu_collection directly.
        # Kept because OtuTable.add() presumably walks its argument - confirm
        # before removing.
        wrapped_table = OtuTable()
        wrapped_table.add(found_otu_collection)
        wrapped_collection = OtuTableCollection()
        wrapped_collection.otu_table_objects = [wrapped_table]

        per_sample = {}

        search_results = SequenceSearcher().global_search(
            metagenome_otu_table_collection, found_otu_collection,
            sequence_identity)
        for result in search_results:
            query = result.query
            sample = query.sample_name
            try:
                block = per_sample[sample]
            except KeyError:
                block = AppraisalBuildingBlock()
                per_sample[sample] = block

            # Results without a target only make sure the sample has an entry.
            if result.target is None:
                continue
            block.num_found += query.count
            block.found_otus.append(query)

        return per_sample
Beispiel #2
0
    def _appraise_inexactly(self, metagenome_otu_table_collection,
                            found_otu_collection,
                            sequence_identity):
        '''Given a metagenome sample collection and OTUs 'found' either by
        binning or assembly, work out which metagenome OTUs have been found
        using inexact matching.

        Results come from SequenceSearcher().global_search(), called with the
        three arguments of this method in order. Each result is filed under
        the sample name of its query; when the result has a target, the
        query's count is added to num_found and the query is appended to
        found_otus.

        Returns
        -------
        dict of sample name to AppraisalBuildingBlock
        '''
        # NOTE(review): the collection assembled here is not used by the
        # search below (found_otu_collection is passed instead). Left in
        # place since OtuTable.add() presumably iterates its input - confirm.
        found_table = OtuTable()
        found_table.add(found_otu_collection)
        found_wrapper = OtuTableCollection()
        found_wrapper.otu_table_objects = [found_table]

        blocks_by_sample = {}

        for hit in SequenceSearcher().global_search(
                metagenome_otu_table_collection,
                found_otu_collection,
                sequence_identity):
            query_otu = hit.query
            name = query_otu.sample_name
            if name not in blocks_by_sample:
                blocks_by_sample[name] = AppraisalBuildingBlock()
            current = blocks_by_sample[name]

            if hit.target is not None:
                current.num_found += query_otu.count
                current.found_otus.append(query_otu)

        return blocks_by_sample
Beispiel #3
0
    def rarefy(self, otu_table_collection, num_to_sample, random_generator=random):
        '''Return an OtuTable rarefied so that only num_to_sample sequences
        are present in each sample/gene combination. Combinations not
        containing sufficient sequences are ignored with a warning.

        This is not a true rarefaction technique because sequences not
        chosen in the rarefaction can still influence the output table
        through the LCA or arbitrary choice operation that has been
        carried out on the input table.

        Also, the rarefier operates on counts rather than predicted
        coverage, skewing the results toward OTUs that lack
        inserts. But not by a lot, presumably.

        Parameters
        ----------
        otu_table_collection: OtuTableCollection
            iterable of OTUs; each is read for its sample_name, marker,
            sequence and count
        num_to_sample: int
            number of sequences to sample from each sample/gene combination
        random_generator:
            object providing sample(population, k); defaults to the random
            module

        Returns
        -------
        OtuTable holding shallow copies of the sampled OTUs, each with its
        count set to the number of times its sequence was drawn.

        Raises
        ------
        Exception
            if the same sequence occurs twice for one sample/gene combination
        '''
        sample_to_gene_to_otu = {}
        for otu in otu_table_collection:
            sample_name = otu.sample_name
            gene = otu.marker
            sequence_to_otu = sample_to_gene_to_otu.setdefault(
                sample_name, {}).setdefault(gene, {})
            if otu.sequence in sequence_to_otu:
                # The values must be passed as a tuple: '%' binds tighter
                # than ',', so without the parentheses the formatting itself
                # fails with a TypeError.
                raise Exception(
                    "Found duplicate sequence in OTU table in sample %s, gene %s"
                    % (sample_name, gene))
            sequence_to_otu[otu.sequence] = otu

        # The output table is only created once the input has been validated.
        to_return = OtuTable()
        for sample_name, gene_to_otus in sample_to_gene_to_otu.items():
            for gene, sequence_to_otu in gene_to_otus.items():
                # One list entry per counted sequence, so that sampling
                # without replacement operates on individual sequences.
                sequences_to_sample = []
                for sequence, otu in sequence_to_otu.items():
                    sequences_to_sample.extend([sequence] * otu.count)

                if len(sequences_to_sample) < num_to_sample:
                    logging.warning("Sample %s gene %s only contains %i sequences, so cannot be rarefied. Ignoring this sample/gene combination" % (sample_name, gene, len(sequences_to_sample)))
                    continue

                sequences_sampled = random_generator.sample(sequences_to_sample, num_to_sample)
                sequence_counts = {}
                for seq in sequences_sampled:
                    sequence_counts[seq] = sequence_counts.get(seq, 0) + 1

                for seq, count in sequence_counts.items():
                    # Shallow copy so the input OTU keeps its original count.
                    e = copy.copy(sequence_to_otu[seq])
                    e.count = count
                    to_return.add([e])
        return to_return
                        
                        
                        
Beispiel #4
0
    def collapse_coupled(self):
        '''Return an OTU table that is collapsed in 2 ways: OTUs sharing a
        sample and sequence are merged, and sample names are trimmed with the
        regular expression r'.[12]$'.

        Every OTU obtained by iterating self has its sample_name overwritten
        in place with the trimmed name. A sample/sequence pair seen once is
        passed through as that same OTU object; a pair seen several times
        becomes a new OtuTableEntry whose count and coverage are the sums
        over the group, and whose marker and taxonomy come from the first
        OTU of the group.

        Returns
        -------
        OtuTable
        '''
        # NOTE(review): the '.' is unescaped, so any character followed by a
        # trailing 1 or 2 is stripped, not only a literal dot - confirm this
        # is intended.
        suffix_pattern = re.compile(r'.[12]$')
        grouped = OrderedDict()
        for otu in self:
            trimmed_name = suffix_pattern.sub('',otu.sample_name)
            otu.sample_name = trimmed_name
            by_sequence = grouped.setdefault(trimmed_name, OrderedDict())
            by_sequence.setdefault(otu.sequence, []).append(otu)

        collapsed = OtuTable()
        for sample, by_sequence in grouped.items():
            for seq, members in by_sequence.items():
                if len(members) != 1:
                    merged = OtuTableEntry()
                    merged.marker = members[0].marker
                    merged.sample_name = sample
                    merged.sequence = seq
                    merged.count = sum(member.count for member in members)
                    merged.coverage = sum(member.coverage for member in members)
                    merged.taxonomy = members[0].taxonomy #TODO: Make this more of a 'median' taxonomy.
                    members = [merged]
                collapsed.add(members)
        return collapsed
Beispiel #5
0
    def write_rarefied_otu_table(**kwargs):
        '''Rarefy an OTU table collection and write the result to a stream.

        Keyword arguments
        -----------------
        output_table_io:
            stream the rarefied OTU table is written to (required). It does
            not need a 'name' attribute.
        table_collection:
            iterable of OTUs to rarefy (required)
        number_to_choose: int, optional
            number of sequences to keep per sample/gene combination. When
            omitted or None, the smallest total count found over all
            sample/gene combinations of table_collection is used.

        Raises
        ------
        KeyError
            if a required keyword argument is missing
        Exception
            if an unexpected keyword argument is given
        ValueError
            if number_to_choose is not given and table_collection yields no
            OTUs (raised by min() on an empty sequence)
        '''
        output_table_io = kwargs.pop('output_table_io')
        table_collection = kwargs.pop('table_collection')
        number_to_choose = kwargs.pop('number_to_choose', None)
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        if number_to_choose is None:
            # Total count per sample/gene combination.
            counts = {}
            for otu in table_collection:
                key = "%s_singlem_RAND8_%s" % (otu.sample_name, otu.marker)
                counts[key] = counts.get(key, 0) + otu.count
            number_to_choose = min(counts.values())
            logging.info(
                "Minimum number of sequences detected is %i, rarefying all sample/gene combinations to this level"
                % number_to_choose)

        # In-memory streams (e.g. io.StringIO) have no 'name', so only
        # mention the destination when there is one to report.
        if hasattr(output_table_io, 'name'):
            logging.info(
                "Rarefying OTU table to max %i sequences per sample/gene combination and writing to %s"
                % (number_to_choose, output_table_io.name))
        else:
            logging.info(
                "Rarefying OTU table to max %i sequences per sample/gene combination"
                % number_to_choose)
        OtuTable.write_otus_to(
            Rarefier().rarefy(table_collection, number_to_choose),
            output_table_io)
Beispiel #6
0
    def rarefy(self, otu_table_collection, num_to_sample, random_generator=random):
        '''Return an OtuTable rarefied so that only num_to_sample sequences
        are present in each sample/gene combination. Combinations not
        containing sufficient sequences are ignored with a warning.

        This is not a true rarefaction technique because sequences not
        chosen in the rarefaction can still influence the output table
        through the LCA or arbitrary choice operation that has been
        carried out on the input table.

        Also, the rarefier operates on counts rather than predicted
        coverage, skewing the results toward OTUs that lack
        inserts. But not by a lot, presumably.

        Parameters
        ----------
        otu_table_collection: OtuTableCollection
            iterable of OTUs; each is read for its sample_name, marker,
            sequence and count
        num_to_sample: int
            number of sequences to sample from each sample/gene combination
        random_generator:
            object providing sample(population, k); defaults to the random
            module

        Returns
        -------
        OtuTable holding shallow copies of the sampled OTUs, each with its
        count set to the number of times its sequence was drawn.

        Raises
        ------
        Exception
            if the same sequence occurs twice for one sample/gene combination
        '''
        sample_to_gene_to_otu = {}
        for otu in otu_table_collection:
            sample_name = otu.sample_name
            gene = otu.marker
            sequence_to_otu = sample_to_gene_to_otu.setdefault(
                sample_name, {}).setdefault(gene, {})
            if otu.sequence in sequence_to_otu:
                # The values must be passed as a tuple: '%' binds tighter
                # than ',', so without the parentheses the formatting itself
                # fails with a TypeError.
                raise Exception(
                    "Found duplicate sequence in OTU table in sample %s, gene %s"
                    % (sample_name, gene))
            sequence_to_otu[otu.sequence] = otu

        # The output table is only created once the input has been validated.
        to_return = OtuTable()
        for sample_name, gene_to_otus in sample_to_gene_to_otu.items():
            for gene, sequence_to_otu in gene_to_otus.items():
                # One list entry per counted sequence, so that sampling
                # without replacement operates on individual sequences.
                sequences_to_sample = []
                for sequence, otu in sequence_to_otu.items():
                    sequences_to_sample.extend([sequence] * otu.count)

                if len(sequences_to_sample) < num_to_sample:
                    logging.warning("Sample %s gene %s only contains %i sequences, so cannot be rarefied. Ignoring this sample/gene combination" % (sample_name, gene, len(sequences_to_sample)))
                    continue

                sequences_sampled = random_generator.sample(sequences_to_sample, num_to_sample)
                sequence_counts = {}
                for seq in sequences_sampled:
                    sequence_counts[seq] = sequence_counts.get(seq, 0) + 1

                for seq, count in sequence_counts.items():
                    # Shallow copy so the input OTU keeps its original count.
                    e = copy.copy(sequence_to_otu[seq])
                    e.count = count
                    to_return.add([e])
        return to_return
Beispiel #7
0
 def __iter__(self):
     '''Iterate over all the OTUs from all the tables. This can only be done once
     since the data is streamed in.

     OTUs are yielded from the archive table IO objects, then the OTU table
     IO objects, then the archive table file paths, then the OTU table file
     paths. Files opened here from a path are closed again once they have
     been read through (or when the generator is closed early); IO objects
     supplied by the caller are left open.
     '''
     for io in self._archive_table_io_objects:
         for otu in ArchiveOtuTable.read(io):
             yield otu
     for io in self._otu_table_io_objects:
         for otu in OtuTable.each(io):
             yield otu
     for file_path in self._archive_table_file_paths:
         # 'with' releases the handle instead of leaving it to the garbage
         # collector.
         with open(file_path) as archive_io:
             for otu in ArchiveOtuTable.read(archive_io):
                 yield otu
     for file_path in self._otu_table_file_paths:
         with open(file_path) as otu_table_io:
             for otu in OtuTable.each(otu_table_io):
                 yield otu
Beispiel #8
0
    def write_clustered_otu_table(**kwargs):
        '''Write clusters of OTUs to a stream as a tab-separated table.

        A header line is written first: OtuTable.DEFAULT_OUTPUT_FIELDS
        followed by 'representative', 'total_num_reads', 'total_coverage',
        'num_sub_otus' and 'max_sub_otu_abundance'. Then one line is written
        per OTU of each cluster, giving the OTU's own fields followed by the
        cluster's sequence, count, coverage, number of OTUs and largest OTU
        count.

        Keyword arguments
        -----------------
        output_table_io:
            stream to write to (required)
        table_collection:
            iterable of clusters, each with sequence, count, coverage and
            otus attributes (required)

        Raises
        ------
        KeyError
            if a required keyword argument is missing
        Exception
            if an unexpected keyword argument is given
        '''
        output_table_io = kwargs.pop('output_table_io')
        table_collection = kwargs.pop('table_collection')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        logging.info("Writing clustered OTU table")
        output_table_io.write(
            "\t".join(
                OtuTable.DEFAULT_OUTPUT_FIELDS+
                ['representative',
                 'total_num_reads',
                 'total_coverage',
                 'num_sub_otus',
                 'max_sub_otu_abundance'])
            +"\n")

        for d in table_collection:
            sub_otus = d.otus
            if len(sub_otus) == 0:
                # No rows to write, and max() of an empty list would raise.
                continue
            # These cells are the same on every row of the cluster, so work
            # them out once rather than once per sub-OTU.
            cluster_cells = [
                d.sequence,
                d.count,
                d.coverage,
                len(sub_otus),
                max([sub_otu.count for sub_otu in sub_otus])
            ]
            for otu in sub_otus:
                row = [
                    otu.marker,
                    otu.sample_name,
                    otu.sequence,
                    otu.count,
                    otu.coverage,
                    otu.taxonomy
                ] + cluster_cells
                output_table_io.write("\t".join(
                    [OtuTable._to_printable(cell) for cell in row])+"\n")
Beispiel #9
0
    def write_clustered_otu_table(**kwargs):
        '''Write clusters of OTUs to a stream as a tab-separated table.

        A header line is written first: OtuTable.DEFAULT_OUTPUT_FIELDS
        followed by 'representative', 'total_num_reads', 'total_coverage',
        'num_sub_otus' and 'max_sub_otu_abundance'. Then one line is written
        per OTU of each cluster, giving the OTU's own fields followed by the
        cluster's sequence, count, coverage, number of OTUs and largest OTU
        count.

        Keyword arguments
        -----------------
        output_table_io:
            stream to write to (required)
        table_collection:
            iterable of clusters, each with sequence, count, coverage and
            otus attributes (required)

        Raises
        ------
        KeyError
            if a required keyword argument is missing
        Exception
            if an unexpected keyword argument is given
        '''
        output_table_io = kwargs.pop('output_table_io')
        table_collection = kwargs.pop('table_collection')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        logging.info("Writing clustered OTU table")
        output_table_io.write(
            "\t".join(
                OtuTable.DEFAULT_OUTPUT_FIELDS+
                ['representative',
                 'total_num_reads',
                 'total_coverage',
                 'num_sub_otus',
                 'max_sub_otu_abundance'])
            +"\n")

        for d in table_collection:
            sub_otus = d.otus
            if len(sub_otus) == 0:
                # No rows to write, and max() of an empty list would raise.
                continue
            # These cells are the same on every row of the cluster, so work
            # them out once rather than once per sub-OTU.
            cluster_cells = [
                d.sequence,
                d.count,
                d.coverage,
                len(sub_otus),
                max([sub_otu.count for sub_otu in sub_otus])
            ]
            for otu in sub_otus:
                row = [
                    otu.marker,
                    otu.sample_name,
                    otu.sequence,
                    otu.count,
                    otu.coverage,
                    otu.taxonomy
                ] + cluster_cells
                output_table_io.write("\t".join(
                    [OtuTable._to_printable(cell) for cell in row])+"\n")
 def __iter__(self):
     '''Iterate over all the OTUs from all the tables. This can only be done once
     since the data is streamed in.

     OTUs are yielded from the archive table IO objects, then the OTU table
     IO objects, then the archive table file paths, then the OTU table file
     paths. Files opened here from a path are closed again once they have
     been read through (or when the generator is closed early); IO objects
     supplied by the caller are left open.
     '''
     for io in self._archive_table_io_objects:
         for otu in ArchiveOtuTable.read(io):
             yield otu
     for io in self._otu_table_io_objects:
         for otu in OtuTable.read(io):
             yield otu
     for file_path in self._archive_table_file_paths:
         # 'with' releases the handle instead of leaving it to the garbage
         # collector.
         with open(file_path) as archive_io:
             for otu in ArchiveOtuTable.read(archive_io):
                 yield otu
     for file_path in self._otu_table_file_paths:
         with open(file_path) as otu_table_io:
             for otu in OtuTable.each(otu_table_io):
                 yield otu
Beispiel #11
0
    def write_otu_table(**kwargs):
        '''Write an OTU table collection to a stream.

        Keyword arguments (all required)
        --------------------------------
        output_table_io:
            stream to write to; its 'name' attribute is logged when present
        table_collection:
            OTUs to write
        output_extras:
            when truthy, the fields printed are those returned by
            table_collection.example_field_names(); otherwise
            OtuTable.write_otus_to() is called without fields_to_print

        Raises
        ------
        KeyError
            if a required keyword argument is missing
        Exception
            if an unexpected keyword argument is given
        '''
        output_table_io = kwargs.pop('output_table_io')
        table_collection = kwargs.pop('table_collection')
        output_extras = kwargs.pop('output_extras')
        if kwargs:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        message = "Writing an OTU table"
        if hasattr(output_table_io, 'name'):
            message = "Writing %s" % output_table_io.name
        logging.info(message)

        if not output_extras:
            OtuTable.write_otus_to(table_collection, output_table_io)
            return
        OtuTable.write_otus_to(
            table_collection, output_table_io,
            fields_to_print=table_collection.example_field_names())
Beispiel #12
0
    def write_otu_table(**kwargs):
        '''Write the OTUs of a table collection to an output stream.

        Keyword arguments (all required)
        --------------------------------
        output_table_io:
            destination stream; when it has a 'name' attribute that name is
            included in the log message
        table_collection:
            OTUs to write
        output_extras:
            truthy to print the fields given by
            table_collection.example_field_names(), falsy to let
            OtuTable.write_otus_to() use its own choice of fields

        Raises
        ------
        KeyError
            if a required keyword argument is missing
        Exception
            if an unexpected keyword argument is given
        '''
        destination = kwargs.pop('output_table_io')
        collection = kwargs.pop('table_collection')
        with_extras = kwargs.pop('output_extras')
        if len(kwargs) != 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        if not hasattr(destination, 'name'):
            logging.info("Writing an OTU table")
        else:
            logging.info("Writing %s" % destination.name)

        if with_extras:
            OtuTable.write_otus_to(
                collection, destination,
                fields_to_print=collection.example_field_names())
        else:
            OtuTable.write_otus_to(collection, destination)
Beispiel #13
0
    def write_rarefied_otu_table(**kwargs):
        '''Rarefy an OTU table collection and write the result to a stream.

        Keyword arguments
        -----------------
        output_table_io:
            stream the rarefied OTU table is written to (required). It does
            not need a 'name' attribute.
        table_collection:
            iterable of OTUs to rarefy (required)
        number_to_choose: int, optional
            number of sequences to keep per sample/gene combination. When
            omitted or None, the smallest total count found over all
            sample/gene combinations of table_collection is used.

        Raises
        ------
        KeyError
            if a required keyword argument is missing
        Exception
            if an unexpected keyword argument is given
        ValueError
            if number_to_choose is not given and table_collection yields no
            OTUs (raised by min() on an empty sequence)
        '''
        output_table_io = kwargs.pop('output_table_io')
        table_collection = kwargs.pop('table_collection')
        number_to_choose = kwargs.pop('number_to_choose', None)
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        if number_to_choose is None:
            # Total count per sample/gene combination.
            counts = {}
            for otu in table_collection:
                key = "%s_singlem_RAND8_%s" % (otu.sample_name, otu.marker)
                counts[key] = counts.get(key, 0) + otu.count
            number_to_choose = min(counts.values())
            logging.info("Minimum number of sequences detected is %i, rarefying all sample/gene combinations to this level" % number_to_choose)

        # In-memory streams (e.g. io.StringIO) have no 'name', so only
        # mention the destination when there is one to report.
        if hasattr(output_table_io, 'name'):
            logging.info("Rarefying OTU table to max %i sequences per sample/gene combination and writing to %s" % (number_to_choose, output_table_io.name))
        else:
            logging.info("Rarefying OTU table to max %i sequences per sample/gene combination" % number_to_choose)
        OtuTable.write_otus_to(Rarefier().rarefy(table_collection, number_to_choose),
                               output_table_io)
Beispiel #14
0
    def print_samples(self, **kwargs):
        '''Look up rows of the 'otus' database table and write them out as an
        OTU table.

        Keyword arguments (all required)
        --------------------------------
        db:
            handed to SequenceDatabase.acquire()
        sample_names:
            when truthy, rows whose sample_name is one of these are selected
        taxonomy:
            consulted only when sample_names is falsy; rows whose taxonomy
            matches a 'like' query for this string surrounded by '%' are
            selected
        output_io:
            handed to OtuTable.write_to()

        Raises
        ------
        KeyError
            if a required keyword argument is missing
        Exception
            if an unexpected keyword argument is given, or if both
            sample_names and taxonomy are falsy
        '''
        database = SequenceDatabase.acquire(kwargs.pop('db'))
        sample_names = kwargs.pop('sample_names')
        taxonomy = kwargs.pop('taxonomy')
        output_io = kwargs.pop('output_io')
        if kwargs:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        dbm = self._connect_to_sqlite(database)

        # Cannot query sqlite with > 999 '?' entries, so query in batches.
        max_set_size = 999
        query_chunks = set(sample_names) if sample_names else [taxonomy]

        otus = OtuTable()
        total_printed = 0
        for chunk in SequenceDatabase.grouper(query_chunks, max_set_size):
            if sample_names:
                # None entries are dropped; presumably grouper() pads the
                # final chunk with them - confirm.
                rows = dbm.table('otus').where_in(
                    'sample_name',
                    [name for name in chunk if name is not None]).get()
            elif taxonomy:
                rows = dbm.table('otus').where(
                    'taxonomy', 'like', "%%%s%%" % taxonomy).get()
            else:
                raise Exception("Programming error")

            for row in rows:
                converted = OtuTableEntry()
                converted.marker = row.marker
                converted.sample_name = row.sample_name
                converted.sequence = row.sequence
                converted.count = row.num_hits
                converted.coverage = row.coverage
                converted.taxonomy = row.taxonomy
                otus.add([converted])
                total_printed += 1
        otus.write_to(output_io)
        logging.info("Printed %i OTU table entries" % total_printed)
    def collapse_coupled(self):
        '''Return an OTU table that is collapsed in 2 ways: OTUs sharing a
        sample and sequence are merged, and sample names are trimmed with the
        regular expression r'.[12]$'.

        Every OTU obtained by iterating self has its sample_name overwritten
        in place with the trimmed name. A sample/sequence pair seen once is
        passed through as that same OTU object; a pair seen several times
        becomes a new OtuTableEntry whose count and coverage are the sums
        over the group, and whose marker and taxonomy come from the first
        OTU of the group.

        Returns
        -------
        OtuTable
        '''
        # NOTE(review): the '.' is unescaped, so any character followed by a
        # trailing 1 or 2 is stripped, not only a literal dot - confirm this
        # is intended.
        suffix_pattern = re.compile(r'.[12]$')
        grouped = OrderedDict()
        for otu in self:
            trimmed_name = suffix_pattern.sub('',otu.sample_name)
            otu.sample_name = trimmed_name
            by_sequence = grouped.setdefault(trimmed_name, OrderedDict())
            by_sequence.setdefault(otu.sequence, []).append(otu)

        collapsed = OtuTable()
        for sample, by_sequence in grouped.items():
            for seq, members in by_sequence.items():
                if len(members) != 1:
                    merged = OtuTableEntry()
                    merged.marker = members[0].marker
                    merged.sample_name = sample
                    merged.sequence = seq
                    merged.count = sum(member.count for member in members)
                    merged.coverage = sum(member.coverage for member in members)
                    merged.taxonomy = members[0].taxonomy #TODO: Make this more of a 'median' taxonomy.
                    members = [merged]
                collapsed.add(members)
        return collapsed
    def add_otu_table(self, input_otu_table_io):
        '''Read a regular style OTU table and append it to the collection.

        Parameters
        ----------
        input_otu_table_io: IO
            open stream of OTU table data, handed to OtuTable.read()

        Returns
        -------
        None
        '''
        tables = self.otu_table_objects
        tables.append(OtuTable.read(input_otu_table_io))
Beispiel #17
0
    def add_otu_table(self, input_otu_table_io):
        '''Add a regular style OTU table to the collection.

        The table is obtained with OtuTable.read() and appended to
        self.otu_table_objects.

        Parameters
        ----------
        input_otu_table_io: IO
            open stream of OTU table data

        Returns
        -------
        None
        '''
        known_tables = self.otu_table_objects
        known_tables.append(OtuTable.read(input_otu_table_io))
Beispiel #18
0
    def print_samples(self, **kwargs):
        '''Look up rows of the 'otus' database table and write them out as an
        OTU table.

        Keyword arguments (all required)
        --------------------------------
        db:
            handed to SequenceDatabase.acquire()
        sample_names:
            when truthy, rows whose sample_name is one of these are selected
        taxonomy:
            consulted only when sample_names is falsy; rows whose taxonomy
            matches a 'like' query for this string surrounded by '%' are
            selected
        output_io:
            handed to OtuTable.write_to()

        Raises
        ------
        KeyError
            if a required keyword argument is missing
        Exception
            if an unexpected keyword argument is given, or if both
            sample_names and taxonomy are falsy
        '''
        database = SequenceDatabase.acquire(kwargs.pop('db'))
        sample_names = kwargs.pop('sample_names')
        taxonomy = kwargs.pop('taxonomy')
        output_io = kwargs.pop('output_io')
        if kwargs:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        dbm = self._connect_to_sqlite(database)

        # Cannot query sqlite with > 999 '?' entries, so query in batches.
        max_set_size = 999
        query_chunks = set(sample_names) if sample_names else [taxonomy]

        otus = OtuTable()
        total_printed = 0
        for chunk in SequenceDatabase.grouper(query_chunks, max_set_size):
            if sample_names:
                # None entries are dropped; presumably grouper() pads the
                # final chunk with them - confirm.
                rows = dbm.table('otus').where_in(
                    'sample_name',
                    [name for name in chunk if name is not None]).get()
            elif taxonomy:
                rows = dbm.table('otus').where(
                    'taxonomy', 'like', "%%%s%%" % taxonomy).get()
            else:
                raise Exception("Programming error")

            for row in rows:
                converted = OtuTableEntry()
                converted.marker = row.marker
                converted.sample_name = row.sample_name
                converted.sequence = row.sequence
                converted.count = row.num_hits
                converted.coverage = row.coverage
                converted.taxonomy = row.taxonomy
                otus.add([converted])
                total_printed += 1
        otus.write_to(output_io)
        logging.info("Printed %i OTU table entries" % total_printed)
Beispiel #19
0
    def run_to_otu_table(self, **kwargs):
        '''Run the pipe, '''
        forward_read_files = kwargs.pop('sequences')
        num_threads = kwargs.pop('threads')
        known_otu_tables = kwargs.pop('known_otu_tables')
        singlem_assignment_method = kwargs.pop('assignment_method')
        output_jplace = kwargs.pop('output_jplace')
        evalue = kwargs.pop('evalue')
        min_orf_length = kwargs.pop('min_orf_length')
        restrict_read_length = kwargs.pop('restrict_read_length')
        filter_minimum_protein = kwargs.pop('filter_minimum_protein')
        filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
        include_inserts = kwargs.pop('include_inserts')
        singlem_packages = kwargs.pop('singlem_packages')
        assign_taxonomy = kwargs.pop('assign_taxonomy')
        known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')

        working_directory = kwargs.pop('working_directory')
        force = kwargs.pop('force')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        self._num_threads = num_threads
        self._evalue = evalue
        self._min_orf_length = min_orf_length
        self._restrict_read_length = restrict_read_length
        self._filter_minimum_protein = filter_minimum_protein
        self._filter_minimum_nucleotide = filter_minimum_nucleotide

        hmms = HmmDatabase(singlem_packages)
        if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
            graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
        else:
            graftm_assignment_method = singlem_assignment_method

        if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
            self._graftm_verbosity = '5'
        else:
            self._graftm_verbosity = '2'

        if not assign_taxonomy:
            singlem_assignment_method = NO_ASSIGNMENT_METHOD

        using_temporary_working_directory = working_directory is None
        if using_temporary_working_directory:
            shared_mem_directory = '/dev/shm'
            if os.path.exists(shared_mem_directory):
                logging.debug("Using shared memory as a base directory")
                tmp = tempdir.TempDir(basedir=shared_mem_directory)
                tempfiles_path = os.path.join(tmp.name, 'tempfiles')
                os.mkdir(tempfiles_path)
                os.environ['TEMP'] = tempfiles_path
            else:
                logging.debug(
                    "Shared memory directory not detected, using default temporary directory instead"
                )
                tmp = tempdir.TempDir()
            working_directory = tmp.name
        else:
            working_directory = working_directory
            if os.path.exists(working_directory):
                if force:
                    logging.info("Overwriting directory %s" %
                                 working_directory)
                    shutil.rmtree(working_directory)
                    os.mkdir(working_directory)
                else:
                    raise Exception(
                        "Working directory '%s' already exists, not continuing"
                        % working_directory)
            else:
                os.mkdir(working_directory)
        logging.debug("Using working directory %s" % working_directory)
        self._working_directory = working_directory
        extracted_reads = None

        def return_cleanly():
            if using_temporary_working_directory: tmp.dissolve()
            logging.info("Finished")

        #### Search
        self._singlem_package_database = hmms
        search_result = self._search(hmms, forward_read_files)
        sample_names = search_result.samples_with_hits()
        if len(sample_names) == 0:
            logging.info("No reads identified in any samples, stopping")
            return_cleanly()
            return None
        logging.debug("Recovered %i samples with at least one hit e.g. '%s'" \
                     % (len(sample_names), sample_names[0]))

        #### Alignment
        align_result = self._align(search_result)

        ### Extract reads that have already known taxonomy
        if known_otu_tables:
            logging.info("Parsing known taxonomy OTU tables")
            known_taxes = KnownOtuTable()
            known_taxes.parse_otu_tables(known_otu_tables)
            logging.debug("Read in %i sequences with known taxonomy" %
                          len(known_taxes))
        else:
            known_taxes = []
        if known_sequence_taxonomy:
            logging.debug("Parsing sequence-wise taxonomy..")
            tax1 = GreenGenesTaxonomy.read(
                open(known_sequence_taxonomy)).taxonomy
            known_sequence_tax = {}
            for seq_id, tax in tax1.items():
                known_sequence_tax[seq_id] = '; '.join(tax)
            logging.info(
                "Read in %i taxonomies from the GreenGenes format taxonomy file"
                % len(known_sequence_tax))

        ### Extract other reads which do not have known taxonomy
        extracted_reads = self._extract_relevant_reads(align_result,
                                                       include_inserts,
                                                       known_taxes)
        logging.info("Finished extracting aligned sequences")

        #### Taxonomic assignment
        if assign_taxonomy:
            logging.info("Running taxonomic assignment with GraftM..")
            assignment_result = self._assign_taxonomy(
                extracted_reads, graftm_assignment_method)

        #### Process taxonomically assigned reads
        # get the sequences out for each of them
        otu_table_object = OtuTable()
        if singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD:
            package_to_taxonomy_bihash = {}

        for readset in extracted_reads:
            sample_name = readset.sample_name
            singlem_package = readset.singlem_package
            known_sequences = readset.known_sequences

            def add_info(infos, otu_table_object, known_tax):
                for info in infos:
                    to_print = [
                        singlem_package.graftm_package_basename(), sample_name,
                        info.seq, info.count, info.coverage, info.taxonomy,
                        info.names, info.aligned_lengths, known_tax
                    ]
                    otu_table_object.data.append(to_print)

            known_infos = self._seqs_to_counts_and_taxonomy(
                known_sequences, NO_ASSIGNMENT_METHOD, known_taxes,
                known_sequence_taxonomy, None)
            add_info(known_infos, otu_table_object, True)

            if len(
                    readset.unknown_sequences
            ) > 0:  # if any sequences were aligned (not just already known)
                tmpbase = readset.tmpfile_basename

                if assign_taxonomy:
                    is_known_taxonomy = False
                    aligned_seqs = list(
                        itertools.chain(readset.unknown_sequences,
                                        readset.known_sequences))

                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        tax_file = assignment_result.diamond_assignment_file(
                            sample_name, singlem_package, tmpbase)
                        taxonomies = DiamondResultParser(tax_file)
                    elif singlem_assignment_method == DIAMOND_ASSIGNMENT_METHOD:
                        tax_file = assignment_result.read_tax_file(
                            sample_name, singlem_package, tmpbase)
                        if not os.path.isfile(tax_file):
                            logging.warn(
                                "Unable to find tax file for gene %s from sample %s "
                                "(likely do to min length filtering), skipping"
                                % (os.path.basename(
                                    singlem_package.base_directory()),
                                   sample_name))
                            taxonomies = {}
                        else:
                            taxonomies = TaxonomyFile(tax_file)

                    elif singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD:
                        bihash_key = singlem_package.base_directory()
                        if bihash_key in package_to_taxonomy_bihash:
                            taxonomy_bihash = package_to_taxonomy_bihash[
                                bihash_key]
                        else:
                            taxtastic_taxonomy = singlem_package.graftm_package(
                            ).taxtastic_taxonomy_path()
                            logging.debug(
                                "Reading taxtastic taxonomy from %s" %
                                taxtastic_taxonomy)
                            with open(taxtastic_taxonomy) as f:
                                taxonomy_bihash = TaxonomyBihash.parse_taxtastic_taxonomy(
                                    f)
                            package_to_taxonomy_bihash[
                                bihash_key] = taxonomy_bihash
                        base_dir = assignment_result._base_dir(
                            sample_name, singlem_package, tmpbase)
                        jplace_file = os.path.join(base_dir,
                                                   "placements.jplace")
                        logging.debug(
                            "Attempting to read jplace output from %s" %
                            jplace_file)
                        if os.path.exists(jplace_file):
                            with open(jplace_file) as f:
                                jplace_json = json.loads(f.read())
                            placement_parser = PlacementParser(
                                jplace_json, taxonomy_bihash, 0.5)
                        else:
                            # Sometimes alignments are filtered out.
                            placement_parser = None
                        taxonomies = {}
                    elif singlem_assignment_method == NO_ASSIGNMENT_METHOD:
                        taxonomies = {}
                    else:
                        raise Exception("Programming error")

                else:  # Taxonomy has not been assigned.
                    aligned_seqs = readset.unknown_sequences
                    if known_sequence_taxonomy:
                        taxonomies = known_sequence_tax
                    else:
                        taxonomies = {}
                    is_known_taxonomy = True

                new_infos = list(
                    self._seqs_to_counts_and_taxonomy(
                        aligned_seqs, singlem_assignment_method,
                        known_sequence_tax if known_sequence_taxonomy else {},
                        taxonomies,
                        placement_parser if singlem_assignment_method
                        == PPLACER_ASSIGNMENT_METHOD else None))
                add_info(new_infos, otu_table_object, is_known_taxonomy)

                if output_jplace:
                    base_dir = assignment_result._base_dir(
                        sample_name, singlem_package, tmpbase)
                    input_jplace_file = os.path.join(base_dir,
                                                     "placements.jplace")
                    output_jplace_file = "%s_%s_%s.jplace" % (
                        output_jplace, sample_name,
                        singlem_package.graftm_package_basename())
                    logging.info("Writing jplace file '%s'" %
                                 output_jplace_file)
                    logging.debug(
                        "Converting jplace file %s to singlem jplace file %s" %
                        (input_jplace_file, output_jplace_file))
                    with open(output_jplace_file, 'w') as output_jplace_io:
                        self._write_jplace_from_infos(open(input_jplace_file),
                                                      new_infos,
                                                      output_jplace_io)
        return_cleanly()
        return otu_table_object
Beispiel #20
0
    def print_appraisal(self, appraisal,
                        doing_binning,
                        output_io=sys.stdout,
                        doing_assembly=False,
                        binned_otu_table_io=None,
                        unbinned_otu_table_io=None,
                        assembled_otu_table_io=None,
                        unaccounted_for_otu_table_io=None):
        '''Write a tab-separated overview of an Appraisal object.

        A header row is written first, then one row per entry of
        appraisal.appraisal_results, then a 'total' row and an 'average' row.
        The percentages of the 'average' row are the mean of the per-sample
        fractions, not the fraction of the summed counts.

        Parameters
        ----------
        appraisal:
            object whose appraisal_results attribute is iterable. Each result
            provides num_not_found and metagenome_sample_name, plus num_binned
            (read when doing_binning) and num_assembled /
            num_assembled_not_binned() (read when doing_assembly).
        doing_binning: boolean
            include the num_binned and percent_binned columns.
        output_io:
            stream the overview is written to (STDOUT by default).
        doing_assembly: boolean
            include the num_assembled and percent_assembled columns.
        binned_otu_table_io, unbinned_otu_table_io, assembled_otu_table_io,
        unaccounted_for_otu_table_io:
            optional streams. When given, the corresponding OTUs of every
            result are collected into an OtuTable that is written to the
            stream after the overview.
        '''

        headers = ['sample']
        if doing_binning: headers.append('num_binned')
        if doing_assembly: headers.append('num_assembled')
        headers.append('num_not_found')
        if doing_binning: headers.append('percent_binned')
        if doing_assembly: headers.append('percent_assembled')
        output_io.write("\t".join(headers)+"\n")

        binned = []
        assembled = []
        assembled_not_binned = []
        not_founds = []

        def print_sample(num_binned, num_assembled, num_assembled_not_binned, num_not_found, sample,
                         mypercent_binned=None, mypercent_assembled=None):
            # Write a single row. Percentages are taken from the mypercent_*
            # arguments when either is given, otherwise derived from counts.
            if mypercent_binned is not None or mypercent_assembled is not None:
                if doing_binning:
                    percent_binned = mypercent_binned
                if doing_assembly:
                    percent_assembled = mypercent_assembled
            else:
                # Binned OTUs are only counted once: the denominator adds the
                # assembled-but-not-binned count rather than num_assembled.
                total = num_not_found
                if doing_binning: total += num_binned
                if doing_assembly: total += num_assembled_not_binned
                if total == 0:
                    if doing_binning: percent_binned = 0.0
                    if doing_assembly: percent_assembled = 0.0
                else:
                    if doing_binning:
                        percent_binned = float(num_binned)/total * 100
                    if doing_assembly:
                        percent_assembled = float(num_assembled)/total * 100
            to_write = [sample]
            if doing_binning: to_write.append(str(num_binned))
            if doing_assembly: to_write.append(str(num_assembled))
            to_write.append(str(num_not_found))
            if doing_binning:
                to_write.append("%2.1f" % percent_binned)
            if doing_assembly:
                to_write.append("%2.1f" % percent_assembled)
            output_io.write("\t".join(to_write)+"\n")

        def mean(l):
            return float(sum(l))/len(l) if len(l) > 0 else float('nan')

        if binned_otu_table_io:
            binned_table = OtuTable()
        if unbinned_otu_table_io:
            unbinned_table = OtuTable()
        if assembled_otu_table_io:
            assembled_table = OtuTable()
        if unaccounted_for_otu_table_io:
            unaccounted_for_table = OtuTable()

        for appraisal_result in appraisal.appraisal_results:
            if doing_assembly:
                num_assembled_not_binned = appraisal_result.num_assembled_not_binned()
            print_sample(appraisal_result.num_binned if doing_binning else None,
                         appraisal_result.num_assembled if doing_assembly else None,
                         num_assembled_not_binned if doing_assembly else None,
                         appraisal_result.num_not_found,
                         appraisal_result.metagenome_sample_name)
            if doing_binning:
                binned.append(appraisal_result.num_binned)
            if doing_assembly:
                assembled.append(appraisal_result.num_assembled)
                assembled_not_binned.append(num_assembled_not_binned)
            not_founds.append(appraisal_result.num_not_found)
            if binned_otu_table_io:
                binned_table.add(appraisal_result.binned_otus)
            if unbinned_otu_table_io:
                unbinned_table.add(appraisal_result.assembled_not_binned_otus())
            if assembled_otu_table_io:
                assembled_table.add(appraisal_result.assembled_otus)
            if unaccounted_for_otu_table_io:
                unaccounted_for_table.add(appraisal_result.not_found_otus)

        print_sample(sum(binned) if doing_binning else None,
                     sum(assembled) if doing_assembly else None,
                     sum(assembled_not_binned) if doing_assembly else None,
                     sum(not_founds),
                     'total')

        # Per-sample fractions for the 'average' row. not_founds holds one
        # entry per result regardless of which columns are enabled.
        binned_means = []
        assembled_means = []
        for i, num_not_found in enumerate(not_founds):
            num_binned = binned[i] if doing_binning else 0
            num_assembled = assembled[i] if doing_assembly else 0
            num_assembled_not_binned = assembled_not_binned[i] if doing_assembly else 0
            total = num_assembled_not_binned+num_not_found
            if doing_binning:
                total += num_binned
                # A sample without any counts contributes 0%, matching what
                # print_sample reports for it, instead of dividing by zero.
                binned_means.append(
                    float(num_binned)/total if total != 0 else 0.0)
            if doing_assembly:
                assembled_means.append(
                    float(num_assembled)/total if total != 0 else 0.0)
        print_sample("%2.1f" % mean(binned) if doing_binning else None,
                     "%2.1f" % mean(assembled) if doing_assembly else None,
                     None,
                     "%2.1f" % mean(not_founds),
                     'average',
                     mypercent_binned=mean(binned_means)*100 if doing_binning else None,
                     mypercent_assembled=(mean(assembled_means)*100 if doing_assembly else None))

        if binned_otu_table_io:
            binned_table.write_to(binned_otu_table_io)
        if unbinned_otu_table_io:
            unbinned_table.write_to(unbinned_otu_table_io)
        if assembled_otu_table_io:
            assembled_table.write_to(assembled_otu_table_io)
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.write_to(unaccounted_for_otu_table_io)
Beispiel #21
0
 def print_appraisal(self, appraisal,
                     output_io=sys.stdout,
                     accounted_for_otu_table_io=None,
                     unaccounted_for_otu_table_io=None):
     '''Write a tab-separated overview of an Appraisal object.

     A header row is written first, then one row per entry of
     appraisal.appraisal_results, then a 'total' row and an 'average' row.
     The percentage of the 'average' row is the mean of the per-sample
     fractions, not the fraction of the summed counts.

     Parameters
     ----------
     appraisal:
         object whose appraisal_results attribute is iterable. Each result
         provides num_found, num_not_found and metagenome_sample_name, and
         found_otus / not_found_otus when the matching table is requested.
     output_io:
         stream the overview is written to (STDOUT by default).
     accounted_for_otu_table_io:
         optional stream; when given, the found OTUs of every result are
         collected into an OtuTable and written to it.
     unaccounted_for_otu_table_io:
         optional stream; as above, for the OTUs that were not found.
     '''

     output_io.write("\t".join(['sample','num_found','num_not_found','percent_found'])+"\n")
     founds = []
     not_founds = []

     def print_sample(num_found, num_not_found, sample, mypercent=None):
         # Compare against None: a precomputed percentage of 0.0 is valid and
         # must not fall through to arithmetic on the formatted counts.
         if mypercent is not None:
             percent = mypercent
         elif num_found + num_not_found == 0:
             percent = 0.0
         else:
             percent = float(num_found)/(num_found+num_not_found) * 100
         output_io.write("\t".join([sample, str(num_found), str(num_not_found), "%2.1f" % percent])+"\n")

     def mean(l):
         return float(sum(l))/len(l) if len(l) > 0 else float('nan')

     if accounted_for_otu_table_io:
         accounted_for_table = OtuTable()
     if unaccounted_for_otu_table_io:
         unaccounted_for_table = OtuTable()

     for appraisal_result in appraisal.appraisal_results:
         print_sample(appraisal_result.num_found,
                      appraisal_result.num_not_found,
                      appraisal_result.metagenome_sample_name)
         founds.append(appraisal_result.num_found)
         not_founds.append(appraisal_result.num_not_found)
         if accounted_for_otu_table_io:
             accounted_for_table.add(appraisal_result.found_otus)
         # Guard on the stream that belongs to this table, so each table can
         # be requested independently of the other.
         if unaccounted_for_otu_table_io:
             unaccounted_for_table.add(appraisal_result.not_found_otus)

     print_sample(sum(founds), sum(not_founds), 'total')

     means = []
     for num_found, num_not_found in zip(founds, not_founds):
         total = num_found + num_not_found
         # A sample without any counts contributes 0%, matching what
         # print_sample reports for it, instead of dividing by zero.
         means.append(float(num_found)/total if total != 0 else 0.0)
     print_sample("%2.1f" % mean(founds), "%2.1f" % mean(not_founds), 'average',
                  mypercent=mean(means)*100)

     if accounted_for_otu_table_io:
         accounted_for_table.write_to(accounted_for_otu_table_io)
     if unaccounted_for_otu_table_io:
         unaccounted_for_table.write_to(unaccounted_for_otu_table_io)
Beispiel #22
0
    def appraise(self, **kwargs):
        '''Given a collection of OTU tables derived from samples, and OTU
        table(s) corresponding to a collection of recovered genomes, how
        much of the community has been recovered in those genomes?

        Genome OTUs are first reduced with
        excluded_duplicate_distinct_genes(). Each metagenome OTU then counts
        as found when its sequence is among the remaining genome sequences
        (exact mode), or when SequenceSearcher().global_search() reports a
        target for it (inexact mode).

        Parameters
        ----------
        kwargs:
            genome_otu_table_collection: OTU tables of the genomes (required)
            metagenome_otu_table_collection: OTU tables of the samples (required)
            sequence_identity: float for 'near enough', None when an exact match is required.

        Returns
        -------
        An Appraisal object containing appraisals for each metagenome

        Raises
        ------
        KeyError if a required keyword is missing, Exception if an
        unexpected keyword is given.
        '''
        genome_otu_table_collection = kwargs.pop('genome_otu_table_collection')
        metagenome_otu_table_collection = kwargs.pop('metagenome_otu_table_collection')
        sequence_identity = kwargs.pop('sequence_identity', None)
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        logging.info("Read in %i markers from the different genomes" %\
                     len(genome_otu_table_collection))
        filtered_genome_otus = \
            list(genome_otu_table_collection.excluded_duplicate_distinct_genes())
        logging.info("After excluding duplicate markers that may indicate "
                     "contamination, found %i markers" % len(filtered_genome_otus))

        sample_name_to_appraisal = {}

        def appraisal_for(sample_name):
            # Fetch the AppraisalResult of a sample, creating it on first use.
            try:
                return sample_name_to_appraisal[sample_name]
            except KeyError:
                result = AppraisalResult()
                result.metagenome_sample_name = sample_name
                sample_name_to_appraisal[sample_name] = result
                return result

        def record(otu, found):
            # Add the OTU and its count to the found or not-found tally.
            appraisal = appraisal_for(otu.sample_name)
            if found:
                appraisal.num_found += otu.count
                appraisal.found_otus.append(otu)
            else:
                appraisal.num_not_found += otu.count
                appraisal.not_found_otus.append(otu)

        if sequence_identity is None:
            genome_otu_sequences = set()
            genome_names = set()
            for otu in filtered_genome_otus:
                genome_otu_sequences.add(otu.sequence)
                genome_names.add(otu.sample_name)
            logging.info("Read in %i unique sequences from the %i reference genomes" %\
                         (len(genome_otu_sequences), len(genome_names)))

            # read in metagenome OTU sequences
            for otu in metagenome_otu_table_collection:
                record(otu, otu.sequence in genome_otu_sequences)

        else:
            seen_otus = set()
            genome_otu_table = OtuTable()
            genome_otu_table.add(filtered_genome_otus)
            filtered_collection = OtuTableCollection()
            filtered_collection.otu_table_objects = [genome_otu_table]
            for uc in SequenceSearcher().global_search(metagenome_otu_table_collection,
                                             filtered_collection,
                                             sequence_identity):
                q = uc.query
                # Only the first search record of a (sample, sequence) pair is
                # counted; later ones are reported and skipped.
                key = (q.sample_name, q.sequence)
                if key in seen_otus:
                    logging.warning("Double-saw an OTU..")
                    continue
                seen_otus.add(key)
                record(q, uc.target is not None)

        app = Appraisal()
        # Store a real list rather than a live dict view so the results can
        # be indexed and are independent of the local dict.
        app.appraisal_results = list(sample_name_to_appraisal.values())
        return app
Beispiel #23
0
    def run(self, **kwargs):
        '''Search, align and optionally taxonomically assign the reads, then
        write the resulting OTU table(s).

        Every keyword below is required unless marked optional; any other
        keyword raises an Exception. Nothing is returned.

        Parameters
        ----------
        kwargs:
            sequences: read files handed to self._search
            otu_table: optional path the OTU table is written to
            archive_otu_table: optional path the archive OTU table is written to
            threads, evalue, min_orf_length, restrict_read_length,
            filter_minimum_protein, filter_minimum_nucleotide:
                stored on self (self._num_threads, self._evalue, ...)
            known_otu_tables: when truthy, parsed into a KnownOtuTable
            assignment_method: one of the *_ASSIGNMENT_METHOD constants
            output_jplace: when truthy, prefix of the jplace files written
                into each assignment base directory
            output_extras: when truthy, otu_table gets all fields rather
                than only the regular ones
            include_inserts: passed on to read extraction and windowing
            singlem_packages: used to build the HmmDatabase
            window_size: accepted but not used by this method
            assign_taxonomy: when truthy, run self._assign_taxonomy
            known_sequence_taxonomy: when truthy, path of a GreenGenes format
                taxonomy file
            working_directory: directory to create and work in, or None to
                use a temporary directory (under /dev/shm when it exists)
            force: when truthy, an existing working_directory is removed
                and recreated instead of raising
        '''
        forward_read_files = kwargs.pop('sequences')
        output_otu_table = kwargs.pop('otu_table', None)
        archive_otu_table = kwargs.pop('archive_otu_table', None)
        num_threads = kwargs.pop('threads')
        known_otu_tables = kwargs.pop('known_otu_tables')
        singlem_assignment_method = kwargs.pop('assignment_method')
        output_jplace = kwargs.pop('output_jplace')
        output_extras = kwargs.pop('output_extras')
        evalue = kwargs.pop('evalue')
        min_orf_length = kwargs.pop('min_orf_length')
        restrict_read_length = kwargs.pop('restrict_read_length')
        filter_minimum_protein = kwargs.pop('filter_minimum_protein')
        filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
        include_inserts = kwargs.pop('include_inserts')
        singlem_packages = kwargs.pop('singlem_packages')
        window_size = kwargs.pop('window_size')
        assign_taxonomy = kwargs.pop('assign_taxonomy')
        known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')

        working_directory = kwargs.pop('working_directory')
        force = kwargs.pop('force')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        self._num_threads = num_threads
        self._evalue = evalue
        self._min_orf_length = min_orf_length
        self._restrict_read_length = restrict_read_length
        self._filter_minimum_protein = filter_minimum_protein
        self._filter_minimum_nucleotide = filter_minimum_nucleotide

        hmms = HmmDatabase(singlem_packages)
        # The example-best-hit method is run as a plain diamond assignment;
        # only the result file that is parsed afterwards differs.
        if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
            graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
        else:
            graftm_assignment_method = singlem_assignment_method

        if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
            self._graftm_verbosity = '5'
        else:
            self._graftm_verbosity = '2'

        using_temporary_working_directory = working_directory is None
        if using_temporary_working_directory:
            shared_mem_directory = '/dev/shm'
            if os.path.exists(shared_mem_directory):
                logging.debug("Using shared memory as a base directory")
                tmp = tempdir.TempDir(basedir=shared_mem_directory)
                tempfiles_path = os.path.join(tmp.name, 'tempfiles')
                os.mkdir(tempfiles_path)
                os.environ['TEMP'] = tempfiles_path
            else:
                logging.debug("Shared memory directory not detected, using default temporary directory instead")
                tmp = tempdir.TempDir()
            working_directory = tmp.name
        else:
            if os.path.exists(working_directory):
                if force:
                    logging.info("Overwriting directory %s" % working_directory)
                    shutil.rmtree(working_directory)
                    os.mkdir(working_directory)
                else:
                    raise Exception("Working directory '%s' already exists, not continuing" % working_directory)
            else:
                os.mkdir(working_directory)
        logging.debug("Using working directory %s" % working_directory)
        self._working_directory = working_directory

        extracted_reads = None
        def return_cleanly():
            # Release the extracted reads and the temporary directory, if any.
            if extracted_reads: extracted_reads.cleanup()
            if using_temporary_working_directory: tmp.dissolve()
            logging.info("Finished")

        #### Search
        self._singlem_package_database = hmms
        search_result = self._search(hmms, forward_read_files)
        sample_names = search_result.samples_with_hits()
        if len(sample_names) == 0:
            logging.info("No reads identified in any samples, stopping")
            return_cleanly()
            return
        logging.debug("Recovered %i samples with at least one hit e.g. '%s'" \
                     % (len(sample_names), sample_names[0]))

        #### Alignment
        align_result = self._align(search_result)

        ### Extract reads that have already known taxonomy
        if known_otu_tables:
            logging.info("Parsing known taxonomy OTU tables")
            known_taxes = KnownOtuTable()
            known_taxes.parse_otu_tables(known_otu_tables)
            logging.debug("Read in %i sequences with known taxonomy" % len(known_taxes))
        else:
            known_taxes = []
        if known_sequence_taxonomy:
            logging.debug("Parsing sequence-wise taxonomy..")
            # Close the taxonomy file once it has been parsed.
            with open(known_sequence_taxonomy) as taxonomy_io:
                tax1 = GreenGenesTaxonomy.read(taxonomy_io).taxonomy
            known_sequence_tax = {}
            for seq_id, tax in tax1.items():
                known_sequence_tax[seq_id] = '; '.join(tax)
            logging.info("Read in %i taxonomies from the GreenGenes format taxonomy file" % len(known_sequence_tax))

        ### Extract other reads which do not have known taxonomy
        extracted_reads = self._extract_relevant_reads(
            align_result, include_inserts, known_taxes)
        logging.info("Finished extracting aligned sequences")

        #### Taxonomic assignment
        if assign_taxonomy:
            logging.info("Running taxonomic assignment with graftm..")
            assignment_result = self._assign_taxonomy(
                extracted_reads, graftm_assignment_method)

        #### Process taxonomically assigned reads
        # get the sequences out for each of them
        otu_table_object = OtuTable()
        regular_output_fields = split('gene sample sequence num_hits coverage taxonomy')
        otu_table_object.fields = regular_output_fields + \
                                  split('read_names nucleotides_aligned taxonomy_by_known?')

        for sample_name, singlem_package, tmp_graft, known_sequences, unknown_sequences in extracted_reads:
            def add_info(infos, otu_table_object, known_tax):
                # Append one OTU table row per info, in the field order set
                # on otu_table_object.fields above.
                for info in infos:
                    to_print = [
                        singlem_package.graftm_package_basename(),
                        sample_name,
                        info.seq,
                        info.count,
                        info.coverage,
                        info.taxonomy,
                        info.names,
                        info.aligned_lengths,
                        known_tax]
                    otu_table_object.data.append(to_print)
            known_infos = self._seqs_to_counts_and_taxonomy(
                known_sequences,
                known_taxes,
                False,
                True)
            add_info(known_infos, otu_table_object, True)

            if tmp_graft: # if any sequences were aligned (not just already known)
                tmpbase = os.path.basename(tmp_graft.name[:-6])#remove .fasta

                if assign_taxonomy:
                    is_known_taxonomy = False
                    aligned_seqs = self._get_windowed_sequences(
                        assignment_result.prealigned_sequence_file(
                            sample_name, singlem_package, tmpbase),
                        assignment_result.nucleotide_hits_file(
                            sample_name, singlem_package, tmpbase),
                        singlem_package,
                        include_inserts)
                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        tax_file = assignment_result.diamond_assignment_file(
                            sample_name, singlem_package, tmpbase)
                    else:
                        tax_file = assignment_result.read_tax_file(
                            sample_name, singlem_package, tmpbase)
                    logging.debug("Reading taxonomy from %s" % tax_file)

                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        taxonomies = DiamondResultParser(tax_file)
                        use_first = True
                    else:
                        if not os.path.isfile(tax_file):
                            logging.warning("Unable to find tax file for gene %s from sample %s "
                                            "(likely do to min length filtering), skipping" % (
                                                os.path.basename(singlem_package.base_directory()),
                                                sample_name))
                            taxonomies = {}
                        else:
                            taxonomies = TaxonomyFile(tax_file)
                        use_first = False

                else: # Taxonomy has not been assigned.
                    aligned_seqs = unknown_sequences
                    if known_sequence_taxonomy:
                        taxonomies = known_sequence_tax
                    else:
                        taxonomies = {}
                    use_first = False # irrelevant
                    is_known_taxonomy = True

                new_infos = list(self._seqs_to_counts_and_taxonomy(
                    aligned_seqs, taxonomies, use_first, False))
                add_info(new_infos, otu_table_object, is_known_taxonomy)

                if output_jplace:
                    # NOTE(review): assignment_result only exists when
                    # assign_taxonomy is truthy - confirm callers never ask
                    # for jplace output without taxonomic assignment.
                    base_dir = assignment_result._base_dir(
                        sample_name, singlem_package, tmpbase)
                    input_jplace_file = os.path.join(base_dir, "placements.jplace")
                    output_jplace_file = os.path.join(base_dir, "%s_%s_%s.jplace" % (
                        output_jplace, sample_name, singlem_package.graftm_package_basename()))
                    logging.debug("Converting jplace file %s to singlem jplace file %s" % (
                        input_jplace_file, output_jplace_file))
                    # Both files are closed as soon as the conversion is done.
                    with open(output_jplace_file, 'w') as output_jplace_io:
                        with open(input_jplace_file) as input_jplace_io:
                            self._write_jplace_from_infos(
                                input_jplace_io, new_infos, output_jplace_io)


        if output_otu_table:
            with open(output_otu_table, 'w') as f:
                if output_extras:
                    otu_table_object.write_to(f, otu_table_object.fields)
                else:
                    otu_table_object.write_to(f, regular_output_fields)
        if archive_otu_table:
            with open(archive_otu_table, 'w') as f:
                otu_table_object.archive(hmms.singlem_packages).write_to(f)
        return_cleanly()
Beispiel #24
0
    def print_appraisal(self,
                        appraisal,
                        doing_binning,
                        output_io=sys.stdout,
                        doing_assembly=False,
                        binned_otu_table_io=None,
                        unbinned_otu_table_io=None,
                        assembled_otu_table_io=None,
                        unaccounted_for_otu_table_io=None):
        '''Write a tab-separated overview of an Appraisal object.

        A header row is written first, then one row per entry of
        appraisal.appraisal_results, then a 'total' row and an 'average' row.
        The percentages of the 'average' row are the mean of the per-sample
        fractions, not the fraction of the summed counts.

        Parameters
        ----------
        appraisal:
            object whose appraisal_results attribute is iterable. Each result
            provides num_not_found and metagenome_sample_name, plus num_binned
            (read when doing_binning) and num_assembled /
            num_assembled_not_binned() (read when doing_assembly).
        doing_binning: boolean
            include the num_binned and percent_binned columns.
        output_io:
            stream the overview is written to (STDOUT by default).
        doing_assembly: boolean
            include the num_assembled and percent_assembled columns.
        binned_otu_table_io, unbinned_otu_table_io, assembled_otu_table_io,
        unaccounted_for_otu_table_io:
            optional streams. When given, the corresponding OTUs of every
            result are collected into an OtuTable that is written to the
            stream after the overview.
        '''

        headers = ['sample']
        if doing_binning: headers.append('num_binned')
        if doing_assembly: headers.append('num_assembled')
        headers.append('num_not_found')
        if doing_binning: headers.append('percent_binned')
        if doing_assembly: headers.append('percent_assembled')
        output_io.write("\t".join(headers) + "\n")

        binned = []
        assembled = []
        assembled_not_binned = []
        not_founds = []

        def print_sample(num_binned,
                         num_assembled,
                         num_assembled_not_binned,
                         num_not_found,
                         sample,
                         mypercent_binned=None,
                         mypercent_assembled=None):
            # Write a single row. Percentages are taken from the mypercent_*
            # arguments when either is given, otherwise derived from counts.
            if mypercent_binned is not None or mypercent_assembled is not None:
                if doing_binning:
                    percent_binned = mypercent_binned
                if doing_assembly:
                    percent_assembled = mypercent_assembled
            else:
                # Binned OTUs are only counted once: the denominator adds the
                # assembled-but-not-binned count rather than num_assembled.
                total = num_not_found
                if doing_binning: total += num_binned
                if doing_assembly: total += num_assembled_not_binned
                if total == 0:
                    if doing_binning: percent_binned = 0.0
                    if doing_assembly: percent_assembled = 0.0
                else:
                    if doing_binning:
                        percent_binned = float(num_binned) / total * 100
                    if doing_assembly:
                        percent_assembled = float(num_assembled) / total * 100
            to_write = [sample]
            if doing_binning: to_write.append(str(num_binned))
            if doing_assembly: to_write.append(str(num_assembled))
            to_write.append(str(num_not_found))
            if doing_binning:
                to_write.append("%2.1f" % percent_binned)
            if doing_assembly:
                to_write.append("%2.1f" % percent_assembled)
            output_io.write("\t".join(to_write) + "\n")

        def mean(l):
            return float(sum(l)) / len(l) if len(l) > 0 else float('nan')

        def fraction(count, total):
            # A sample without any counts contributes 0%, matching what
            # print_sample reports for it, instead of dividing by zero.
            return float(count) / total if total != 0 else 0.0

        if binned_otu_table_io:
            binned_table = OtuTable()
        if unbinned_otu_table_io:
            unbinned_table = OtuTable()
        if assembled_otu_table_io:
            assembled_table = OtuTable()
        if unaccounted_for_otu_table_io:
            unaccounted_for_table = OtuTable()

        for appraisal_result in appraisal.appraisal_results:
            if doing_assembly:
                num_assembled_not_binned = appraisal_result.num_assembled_not_binned(
                )
            print_sample(
                appraisal_result.num_binned if doing_binning else None,
                appraisal_result.num_assembled if doing_assembly else None,
                num_assembled_not_binned if doing_assembly else None,
                appraisal_result.num_not_found,
                appraisal_result.metagenome_sample_name)
            if doing_binning:
                binned.append(appraisal_result.num_binned)
            if doing_assembly:
                assembled.append(appraisal_result.num_assembled)
                assembled_not_binned.append(num_assembled_not_binned)
            not_founds.append(appraisal_result.num_not_found)
            if binned_otu_table_io:
                binned_table.add(appraisal_result.binned_otus)
            if unbinned_otu_table_io:
                unbinned_table.add(
                    appraisal_result.assembled_not_binned_otus())
            if assembled_otu_table_io:
                assembled_table.add(appraisal_result.assembled_otus)
            if unaccounted_for_otu_table_io:
                unaccounted_for_table.add(appraisal_result.not_found_otus)

        print_sample(
            sum(binned) if doing_binning else None,
            sum(assembled) if doing_assembly else None,
            sum(assembled_not_binned) if doing_assembly else None,
            sum(not_founds), 'total')

        # Per-sample fractions for the 'average' row. not_founds holds one
        # entry per result regardless of which columns are enabled.
        binned_means = []
        assembled_means = []
        for i, num_not_found in enumerate(not_founds):
            num_binned = binned[i] if doing_binning else 0
            num_assembled = assembled[i] if doing_assembly else 0
            num_assembled_not_binned = assembled_not_binned[
                i] if doing_assembly else 0
            total = num_assembled_not_binned + num_not_found
            if doing_binning:
                total += num_binned
                binned_means.append(fraction(num_binned, total))
            if doing_assembly:
                assembled_means.append(fraction(num_assembled, total))
        print_sample("%2.1f" % mean(binned) if doing_binning else None,
                     "%2.1f" % mean(assembled) if doing_assembly else None,
                     None,
                     "%2.1f" % mean(not_founds),
                     'average',
                     mypercent_binned=mean(binned_means) *
                     100 if doing_binning else None,
                     mypercent_assembled=(mean(assembled_means) *
                                          100 if doing_assembly else None))

        if binned_otu_table_io:
            binned_table.write_to(binned_otu_table_io)
        if unbinned_otu_table_io:
            unbinned_table.write_to(unbinned_otu_table_io)
        if assembled_otu_table_io:
            assembled_table.write_to(assembled_otu_table_io)
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.write_to(unaccounted_for_otu_table_io)