def collapse_coupled(self):
    '''Build a new OTU table in which coupled samples are merged.

    The sample name of every entry is rewritten by substituting the empty
    string for the regular expression r'.[12]$'. Entries that then share
    both sample name and sequence are merged into a single entry.

    The '.' in the pattern is not escaped, so whatever character precedes
    the trailing 1 or 2 is removed along with it.

    The entries obtained by iterating over self are modified in place:
    their sample_name is overwritten with the shortened name.

    Returns
    -------
    OtuTable holding one entry per (sample name, sequence) pair. A merged
    entry carries the summed count and coverage of its members, and the
    marker and taxonomy of the first member encountered.
    '''
    suffix_pattern = re.compile(r'.[12]$')

    # shortened sample name -> sequence -> entries, in first-seen order
    grouped = OrderedDict()
    for entry in self:
        short_name = suffix_pattern.sub('', entry.sample_name)
        entry.sample_name = short_name
        by_sequence = grouped.setdefault(short_name, OrderedDict())
        by_sequence.setdefault(entry.sequence, []).append(entry)

    collapsed = OtuTable()
    for short_name, by_sequence in grouped.items():
        for sequence, members in by_sequence.items():
            if len(members) != 1:
                first = members[0]
                merged = OtuTableEntry()
                merged.marker = first.marker
                merged.sample_name = short_name
                merged.sequence = sequence
                merged.count = sum(member.count for member in members)
                merged.coverage = sum(member.coverage for member in members)
                # TODO: Make this more of a 'median' taxonomy.
                merged.taxonomy = first.taxonomy
                members = [merged]
            collapsed.add(members)
    return collapsed
def _appraise_inexactly(self, metagenome_otu_table_collection,
                        found_otu_collection, sequence_identity):
    '''Tally, per metagenome sample, the OTUs that were 'found' (by binning
    or by assembly) when inexact sequence matching is used.

    SequenceSearcher().global_search() is run with the metagenome
    collection, found_otu_collection and sequence_identity. For each
    result, the query OTU is credited to the building block of its sample
    whenever the result has a target.

    Returns
    -------
    dict mapping sample name to AppraisalBuildingBlock. A sample gets an
    entry as soon as one of its OTUs appears as a query, even if nothing
    was found for it.
    '''
    wrapped_table = OtuTable()
    wrapped_table.add(found_otu_collection)
    # NOTE(review): this wrapper collection is assembled but the search
    # below is given found_otu_collection directly - confirm whether the
    # wrapper was meant to be passed instead.
    found_collection = OtuTableCollection()
    found_collection.otu_table_objects = [wrapped_table]

    blocks_by_sample = {}
    search_results = SequenceSearcher().global_search(
        metagenome_otu_table_collection,
        found_otu_collection,
        sequence_identity)
    for result in search_results:
        query = result.query
        try:
            block = blocks_by_sample[query.sample_name]
        except KeyError:
            block = AppraisalBuildingBlock()
            blocks_by_sample[query.sample_name] = block

        if result.target is None:
            # No match for this query; the sample keeps its block regardless.
            continue
        block.num_found += query.count
        block.found_otus.append(query)
    return blocks_by_sample
def rarefy(self, otu_table_collection, num_to_sample, random_generator=random):
    '''Return an OtuTable rarefied so that only num_to_sample sequences are
    present in each sample/gene combination. Combinations not containing
    sufficient sequences are ignored with a warning.

    This is not a true rarefaction technique because sequences not chosen
    in the rarefaction can still influence the output table through the
    LCA or arbitrary choice operation that has been carried out on the
    input table.

    Also, the rarefier operates on counts rather than predicted coverage,
    skewing the results toward OTUs that lack inserts. But not by a lot,
    presumably.

    Parameters
    ----------
    otu_table_collection: OtuTableCollection
        OTU tables iterable
    num_to_sample: int
        number of sequences to sample from each sample/gene combination
    random_generator:
        object providing sample(population, k); defaults to the random
        module

    Returns
    -------
    OtuTable of shallow copies of the input OTUs, with count set to the
    number of times each sequence was drawn. OTUs never drawn are absent.

    Raises
    ------
    Exception if a sequence occurs more than once within one sample/gene
    combination.
    '''
    # sample name -> gene -> sequence -> OTU
    sample_to_gene_to_otu = {}
    for otu in otu_table_collection:
        sample_name = otu.sample_name
        gene = otu.marker
        sequence_to_otu = sample_to_gene_to_otu.setdefault(
            sample_name, {}).setdefault(gene, {})
        if otu.sequence in sequence_to_otu:
            # The arguments must be a tuple: '%' binds tighter than ','.
            raise Exception(
                "Found duplicate sequence in OTU table in sample %s, gene %s"
                % (sample_name, gene))
        sequence_to_otu[otu.sequence] = otu

    to_return = OtuTable()
    for sample_name, gene_to_otu in sample_to_gene_to_otu.items():
        for gene, sequence_to_otu in gene_to_otu.items():
            # Expand every OTU into one list element per counted sequence
            # so that sampling without replacement works on individuals.
            sequences_to_sample = []
            for sequence, otu in sequence_to_otu.items():
                for _ in range(otu.count):
                    sequences_to_sample.append(sequence)

            if len(sequences_to_sample) < num_to_sample:
                logging.warning(
                    "Sample %s gene %s only contains %i sequences, so cannot be rarefied. Ignoring this sample/gene combination" % (
                        sample_name, gene, len(sequences_to_sample)))
                continue

            sequences_sampled = random_generator.sample(
                sequences_to_sample, num_to_sample)
            sequence_counts = {}
            for seq in sequences_sampled:
                sequence_counts[seq] = sequence_counts.get(seq, 0) + 1

            for seq, count in sequence_counts.items():
                # Shallow copy so the input OTU keeps its original count.
                e = copy.copy(sequence_to_otu[seq])
                e.count = count
                to_return.add([e])
    return to_return
def _appraise_inexactly(self, metagenome_otu_table_collection,
                        found_otu_collection, sequence_identity):
    '''Tally, per metagenome sample, the OTUs that were 'found' (by binning
    or by assembly) when inexact sequence matching is used.

    SequenceSearcher().global_search() is run with the metagenome
    collection, found_otu_collection and sequence_identity. For each
    result, the query OTU is credited to the building block of its sample
    whenever the result has a target.

    Returns
    -------
    dict mapping sample name to AppraisalBuildingBlock. A sample gets an
    entry as soon as one of its OTUs appears as a query, even if nothing
    was found for it.
    '''
    wrapped_table = OtuTable()
    wrapped_table.add(found_otu_collection)
    # NOTE(review): this wrapper collection is assembled but the search
    # below is given found_otu_collection directly - confirm whether the
    # wrapper was meant to be passed instead.
    found_collection = OtuTableCollection()
    found_collection.otu_table_objects = [wrapped_table]

    blocks_by_sample = {}
    search_results = SequenceSearcher().global_search(
        metagenome_otu_table_collection,
        found_otu_collection,
        sequence_identity)
    for result in search_results:
        query = result.query
        try:
            block = blocks_by_sample[query.sample_name]
        except KeyError:
            block = AppraisalBuildingBlock()
            blocks_by_sample[query.sample_name] = block

        if result.target is None:
            # No match for this query; the sample keeps its block regardless.
            continue
        block.num_found += query.count
        block.found_otus.append(query)
    return blocks_by_sample
def print_appraisal(self, appraisal, output_io=sys.stdout,
                    accounted_for_otu_table_io=None,
                    unaccounted_for_otu_table_io=None):
    '''Write a tab-separated overview of an Appraisal object.

    A header line is written first, then one line per appraisal result,
    then a 'total' line and an 'average' line. The average percentage is
    the mean of the per-sample percentages, not the percentage of the
    totals.

    Parameters
    ----------
    appraisal:
        object whose appraisal_results attribute is an iterable of results
        providing num_found, num_not_found, metagenome_sample_name,
        found_otus and not_found_otus
    output_io:
        writable stream for the overview, STDOUT by default
    accounted_for_otu_table_io:
        if given, the found OTUs of all results are gathered in an
        OtuTable that is written to this stream
    unaccounted_for_otu_table_io:
        if given, the not-found OTUs of all results are gathered in an
        OtuTable that is written to this stream
    '''
    output_io.write("\t".join(
        ['sample', 'num_found', 'num_not_found', 'percent_found']) + "\n")

    def fraction_found(num_found, num_not_found):
        # A sample without any counts is reported as 0 rather than
        # dividing by zero.
        total = num_found + num_not_found
        if total == 0:
            return 0.0
        return float(num_found) / total

    def print_sample(num_found, num_not_found, sample, mypercent=None):
        # Compare against None: a supplied percentage of 0.0 is falsy but
        # must still be used as-is (the counts may be preformatted strings
        # in that case).
        if mypercent is not None:
            percent = mypercent
        else:
            percent = fraction_found(num_found, num_not_found) * 100
        output_io.write("\t".join(
            [sample, str(num_found), str(num_not_found),
             "%2.1f" % percent]) + "\n")

    def mean(values):
        if len(values) > 0:
            return float(sum(values)) / len(values)
        return float('nan')

    accounted_for_table = OtuTable() if accounted_for_otu_table_io else None
    unaccounted_for_table = (
        OtuTable() if unaccounted_for_otu_table_io else None)

    founds = []
    not_founds = []
    for appraisal_result in appraisal.appraisal_results:
        print_sample(appraisal_result.num_found,
                     appraisal_result.num_not_found,
                     appraisal_result.metagenome_sample_name)
        founds.append(appraisal_result.num_found)
        not_founds.append(appraisal_result.num_not_found)
        if accounted_for_table is not None:
            accounted_for_table.add(appraisal_result.found_otus)
        # Each table is guarded by its own output stream.
        if unaccounted_for_table is not None:
            unaccounted_for_table.add(appraisal_result.not_found_otus)

    print_sample(sum(founds), sum(not_founds), 'total')

    means = [fraction_found(num_found, num_not_found)
             for num_found, num_not_found in zip(founds, not_founds)]
    print_sample("%2.1f" % mean(founds), "%2.1f" % mean(not_founds),
                 'average', mypercent=mean(means) * 100)

    if accounted_for_table is not None:
        accounted_for_table.write_to(accounted_for_otu_table_io)
    if unaccounted_for_table is not None:
        unaccounted_for_table.write_to(unaccounted_for_otu_table_io)
def print_samples(self, **kwargs):
    '''Fetch OTUs from a sequence database and write them out as an OTU
    table.

    Parameters
    ----------
    kwargs:
        db: passed to SequenceDatabase.acquire() to obtain the database
        sample_names: iterable of sample names to fetch; when non-empty it
            takes precedence over taxonomy
        taxonomy: substring searched for in the taxonomy column (SQL
            'like' with a wildcard on either side); only used when
            sample_names is empty
        output_io: stream handed to the OTU table's write_to()

    Raises
    ------
    KeyError if one of the four arguments is missing, Exception if any
    other argument is given or if neither sample_names nor taxonomy is
    set.
    '''
    db = SequenceDatabase.acquire(kwargs.pop('db'))
    sample_names = kwargs.pop('sample_names')
    taxonomy = kwargs.pop('taxonomy')
    output_io = kwargs.pop('output_io')
    if kwargs:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    dbm = self._connect_to_sqlite(db)

    # Cannot query sqlite with > 999 '?' entries, so query in batches.
    max_set_size = 999
    query_chunks = set(sample_names) if sample_names else [taxonomy]

    otus = OtuTable()
    total_printed = 0
    for chunk in SequenceDatabase.grouper(query_chunks, max_set_size):
        if sample_names:
            otus_query = dbm.table('otus')
            # None entries in a chunk are dropped - presumably padding
            # added by grouper(); verify against its definition.
            wanted = [name for name in chunk if name is not None]
            rows = otus_query.where_in('sample_name', wanted).get()
        elif taxonomy:
            like_pattern = "%%%s%%" % taxonomy
            rows = dbm.table('otus').where(
                'taxonomy', 'like', like_pattern).get()
        else:
            raise Exception("Programming error")

        for row in rows:
            otu = OtuTableEntry()
            otu.marker = row.marker
            otu.sample_name = row.sample_name
            otu.sequence = row.sequence
            otu.count = row.num_hits
            otu.coverage = row.coverage
            otu.taxonomy = row.taxonomy
            otus.add([otu])
            total_printed += 1

    otus.write_to(output_io)
    logging.info("Printed %i OTU table entries" % total_printed)
def print_samples(self, **kwargs):
    '''Fetch OTUs from a sequence database and write them out as an OTU
    table.

    Parameters
    ----------
    kwargs:
        db: passed to SequenceDatabase.acquire() to obtain the database
        sample_names: iterable of sample names to fetch; when non-empty it
            takes precedence over taxonomy
        taxonomy: substring searched for in the taxonomy column (SQL
            'like' with a wildcard on either side); only used when
            sample_names is empty
        output_io: stream handed to the OTU table's write_to()

    Raises
    ------
    KeyError if one of the four arguments is missing, Exception if any
    other argument is given or if neither sample_names nor taxonomy is
    set.
    '''
    db = SequenceDatabase.acquire(kwargs.pop('db'))
    sample_names = kwargs.pop('sample_names')
    taxonomy = kwargs.pop('taxonomy')
    output_io = kwargs.pop('output_io')
    if kwargs:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    dbm = self._connect_to_sqlite(db)

    # Cannot query sqlite with > 999 '?' entries, so query in batches.
    max_set_size = 999
    query_chunks = set(sample_names) if sample_names else [taxonomy]

    otus = OtuTable()
    total_printed = 0
    for chunk in SequenceDatabase.grouper(query_chunks, max_set_size):
        if sample_names:
            otus_query = dbm.table('otus')
            # None entries in a chunk are dropped - presumably padding
            # added by grouper(); verify against its definition.
            wanted = [name for name in chunk if name is not None]
            rows = otus_query.where_in('sample_name', wanted).get()
        elif taxonomy:
            like_pattern = "%%%s%%" % taxonomy
            rows = dbm.table('otus').where(
                'taxonomy', 'like', like_pattern).get()
        else:
            raise Exception("Programming error")

        for row in rows:
            otu = OtuTableEntry()
            otu.marker = row.marker
            otu.sample_name = row.sample_name
            otu.sequence = row.sequence
            otu.count = row.num_hits
            otu.coverage = row.coverage
            otu.taxonomy = row.taxonomy
            otus.add([otu])
            total_printed += 1

    otus.write_to(output_io)
    logging.info("Printed %i OTU table entries" % total_printed)
def print_appraisal(self, appraisal, doing_binning, output_io=sys.stdout,
                    doing_assembly=False, binned_otu_table_io=None,
                    unbinned_otu_table_io=None, assembled_otu_table_io=None,
                    unaccounted_for_otu_table_io=None):
    '''Write a tab-separated overview of an Appraisal object.

    A header line is written first, then one line per appraisal result,
    then a 'total' line and an 'average' line. Columns for binning and
    assembly only appear when the respective flag is set. Percentages are
    relative to not-found + binned (if binning) + assembled-but-not-binned
    (if assembling); the average percentages are means of the per-sample
    values.

    Parameters
    ----------
    appraisal:
        object whose appraisal_results attribute is an iterable of results
        providing num_binned, num_assembled, num_not_found,
        metagenome_sample_name, num_assembled_not_binned(), binned_otus,
        assembled_otus, not_found_otus and assembled_not_binned_otus()
    doing_binning: bool
        report binned counts and percentages
    output_io:
        writable stream for the overview, STDOUT by default
    doing_assembly: bool
        report assembled counts and percentages
    binned_otu_table_io, unbinned_otu_table_io, assembled_otu_table_io,
    unaccounted_for_otu_table_io:
        for each stream that is given, the corresponding OTUs of all
        results (binned, assembled but not binned, assembled, not found)
        are gathered in an OtuTable and written to it
    '''
    headers = ['sample']
    if doing_binning:
        headers.append('num_binned')
    if doing_assembly:
        headers.append('num_assembled')
    headers.append('num_not_found')
    if doing_binning:
        headers.append('percent_binned')
    if doing_assembly:
        headers.append('percent_assembled')
    output_io.write("\t".join(headers) + "\n")

    def print_sample(num_binned, num_assembled, num_assembled_not_binned,
                     num_not_found, sample,
                     mypercent_binned=None, mypercent_assembled=None):
        if mypercent_binned is not None or mypercent_assembled is not None:
            # Percentages were precomputed by the caller; the counts may
            # be preformatted strings here, so no arithmetic is done.
            if doing_binning:
                percent_binned = mypercent_binned
            if doing_assembly:
                percent_assembled = mypercent_assembled
        else:
            total = num_not_found
            if doing_binning:
                total += num_binned
            if doing_assembly:
                total += num_assembled_not_binned
            if total == 0:
                if doing_binning:
                    percent_binned = 0.0
                if doing_assembly:
                    percent_assembled = 0.0
            else:
                if doing_binning:
                    percent_binned = float(num_binned) / total * 100
                if doing_assembly:
                    percent_assembled = float(num_assembled) / total * 100

        to_write = [sample]
        if doing_binning:
            to_write.append(str(num_binned))
        if doing_assembly:
            to_write.append(str(num_assembled))
        to_write.append(str(num_not_found))
        if doing_binning:
            to_write.append("%2.1f" % percent_binned)
        if doing_assembly:
            to_write.append("%2.1f" % percent_assembled)
        output_io.write("\t".join(to_write) + "\n")

    def mean(values):
        if len(values) > 0:
            return float(sum(values)) / len(values)
        return float('nan')

    def fraction(numerator, total):
        # Mirrors print_sample(): a sample without any counts contributes
        # 0 to the average instead of raising ZeroDivisionError.
        if total == 0:
            return 0.0
        return float(numerator) / total

    if binned_otu_table_io:
        binned_table = OtuTable()
    if unbinned_otu_table_io:
        unbinned_table = OtuTable()
    if assembled_otu_table_io:
        assembled_table = OtuTable()
    if unaccounted_for_otu_table_io:
        unaccounted_for_table = OtuTable()

    binned = []
    assembled = []
    assembled_not_binned = []
    not_founds = []
    for appraisal_result in appraisal.appraisal_results:
        if doing_assembly:
            num_assembled_not_binned = \
                appraisal_result.num_assembled_not_binned()
        else:
            num_assembled_not_binned = None
        print_sample(
            appraisal_result.num_binned if doing_binning else None,
            appraisal_result.num_assembled if doing_assembly else None,
            num_assembled_not_binned,
            appraisal_result.num_not_found,
            appraisal_result.metagenome_sample_name)

        if doing_binning:
            binned.append(appraisal_result.num_binned)
        if doing_assembly:
            assembled.append(appraisal_result.num_assembled)
            assembled_not_binned.append(num_assembled_not_binned)
        not_founds.append(appraisal_result.num_not_found)

        if binned_otu_table_io:
            binned_table.add(appraisal_result.binned_otus)
        if unbinned_otu_table_io:
            unbinned_table.add(appraisal_result.assembled_not_binned_otus())
        if assembled_otu_table_io:
            assembled_table.add(appraisal_result.assembled_otus)
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.add(appraisal_result.not_found_otus)

    print_sample(sum(binned) if doing_binning else None,
                 sum(assembled) if doing_assembly else None,
                 sum(assembled_not_binned) if doing_assembly else None,
                 sum(not_founds),
                 'total')

    # Per-sample fractions, averaged below for the 'average' line.
    binned_means = []
    assembled_means = []
    for i, num_not_found in enumerate(not_founds):
        total = num_not_found
        if doing_assembly:
            total += assembled_not_binned[i]
        if doing_binning:
            total += binned[i]
            binned_means.append(fraction(binned[i], total))
        if doing_assembly:
            assembled_means.append(fraction(assembled[i], total))

    print_sample(
        "%2.1f" % mean(binned) if doing_binning else None,
        "%2.1f" % mean(assembled) if doing_assembly else None,
        None,
        "%2.1f" % mean(not_founds),
        'average',
        mypercent_binned=mean(binned_means) * 100 if doing_binning else None,
        mypercent_assembled=(
            mean(assembled_means) * 100 if doing_assembly else None))

    if binned_otu_table_io:
        binned_table.write_to(binned_otu_table_io)
    if unbinned_otu_table_io:
        unbinned_table.write_to(unbinned_otu_table_io)
    if assembled_otu_table_io:
        assembled_table.write_to(assembled_otu_table_io)
    if unaccounted_for_otu_table_io:
        unaccounted_for_table.write_to(unaccounted_for_otu_table_io)
def appraise(self, **kwargs):
    '''Given a collection of OTU tables derived from samples, and OTU
    table(s) corresponding to a collection of recovered genomes, how much
    of the community has been recovered in those genomes?

    Parameters
    ----------
    kwargs:
        genome_otu_table_collection: OTUs of the recovered genomes; must
            support len() and excluded_duplicate_distinct_genes()
        metagenome_otu_table_collection: OTUs of the metagenome samples
        sequence_identity: float for 'near enough', None (the default)
            when an exact match is required.

    Returns
    -------
    An Appraisal object containing appraisals for each metagenome. Its
    appraisal_results attribute is a list of AppraisalResult objects.

    Raises
    ------
    KeyError if a required argument is missing, Exception if an
    unexpected argument is given.
    '''
    genome_otu_table_collection = kwargs.pop('genome_otu_table_collection')
    metagenome_otu_table_collection = kwargs.pop(
        'metagenome_otu_table_collection')
    sequence_identity = kwargs.pop('sequence_identity', None)
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    logging.info("Read in %i markers from the different genomes" %
                 len(genome_otu_table_collection))
    filtered_genome_otus = list(
        genome_otu_table_collection.excluded_duplicate_distinct_genes())
    logging.info("After excluding duplicate markers that may indicate "
                 "contamination, found %i markers" % len(filtered_genome_otus))

    sample_name_to_appraisal = {}

    def result_for(sample_name):
        # Fetch the AppraisalResult of a sample, creating it on first use.
        try:
            return sample_name_to_appraisal[sample_name]
        except KeyError:
            result = AppraisalResult()
            result.metagenome_sample_name = sample_name
            sample_name_to_appraisal[sample_name] = result
            return result

    if sequence_identity is None:
        # Exact matching: a metagenome OTU counts as found when its
        # sequence is identical to one of the genome sequences.
        genome_otu_sequences = set()
        genome_names = set()
        for otu in filtered_genome_otus:
            genome_otu_sequences.add(otu.sequence)
            genome_names.add(otu.sample_name)
        logging.info(
            "Read in %i unique sequences from the %i reference genomes" %
            (len(genome_otu_sequences), len(genome_names)))

        # read in metagenome OTU sequences
        for otu in metagenome_otu_table_collection:
            appraisal = result_for(otu.sample_name)
            count = otu.count
            if otu.sequence in genome_otu_sequences:
                appraisal.num_found += count
                appraisal.found_otus.append(otu)
            else:
                appraisal.num_not_found += count
                appraisal.not_found_otus.append(otu)
    else:
        # Inexact matching: delegate to the sequence searcher and count a
        # query as found when the search reports a target for it.
        genome_otu_table = OtuTable()
        genome_otu_table.add(filtered_genome_otus)
        filtered_collection = OtuTableCollection()
        filtered_collection.otu_table_objects = [genome_otu_table]

        # NOTE(review): the duplicate check keys on sample name and
        # sequence only, not on the marker - confirm this is intended.
        seen_otus = set()
        for uc in SequenceSearcher().global_search(
                metagenome_otu_table_collection,
                filtered_collection,
                sequence_identity):
            q = uc.query
            key = (q.sample_name, q.sequence)
            if key in seen_otus:
                logging.warning("Double-saw an OTU..")
                continue
            seen_otus.add(key)

            appraisal = result_for(q.sample_name)
            if uc.target is None:
                appraisal.num_not_found += q.count
                appraisal.not_found_otus.append(q)
            else:
                appraisal.num_found += q.count
                appraisal.found_otus.append(q)

    app = Appraisal()
    # Materialise the results so that callers get a stable list rather
    # than a live view onto the local dictionary.
    app.appraisal_results = list(sample_name_to_appraisal.values())
    return app
def print_appraisal(self, appraisal, doing_binning, output_io=sys.stdout,
                    doing_assembly=False, binned_otu_table_io=None,
                    unbinned_otu_table_io=None, assembled_otu_table_io=None,
                    unaccounted_for_otu_table_io=None):
    '''Write a tab-separated overview of an Appraisal object.

    A header line is written first, then one line per appraisal result,
    then a 'total' line and an 'average' line. Columns for binning and
    assembly only appear when the respective flag is set. Percentages are
    relative to not-found + binned (if binning) + assembled-but-not-binned
    (if assembling); the average percentages are means of the per-sample
    values.

    Parameters
    ----------
    appraisal:
        object whose appraisal_results attribute is an iterable of results
        providing num_binned, num_assembled, num_not_found,
        metagenome_sample_name, num_assembled_not_binned(), binned_otus,
        assembled_otus, not_found_otus and assembled_not_binned_otus()
    doing_binning: bool
        report binned counts and percentages
    output_io:
        writable stream for the overview, STDOUT by default
    doing_assembly: bool
        report assembled counts and percentages
    binned_otu_table_io, unbinned_otu_table_io, assembled_otu_table_io,
    unaccounted_for_otu_table_io:
        for each stream that is given, the corresponding OTUs of all
        results (binned, assembled but not binned, assembled, not found)
        are gathered in an OtuTable and written to it
    '''
    headers = ['sample']
    if doing_binning:
        headers.append('num_binned')
    if doing_assembly:
        headers.append('num_assembled')
    headers.append('num_not_found')
    if doing_binning:
        headers.append('percent_binned')
    if doing_assembly:
        headers.append('percent_assembled')
    output_io.write("\t".join(headers) + "\n")

    def print_sample(num_binned, num_assembled, num_assembled_not_binned,
                     num_not_found, sample,
                     mypercent_binned=None, mypercent_assembled=None):
        if mypercent_binned is not None or mypercent_assembled is not None:
            # Percentages were precomputed by the caller; the counts may
            # be preformatted strings here, so no arithmetic is done.
            if doing_binning:
                percent_binned = mypercent_binned
            if doing_assembly:
                percent_assembled = mypercent_assembled
        else:
            total = num_not_found
            if doing_binning:
                total += num_binned
            if doing_assembly:
                total += num_assembled_not_binned
            if total == 0:
                if doing_binning:
                    percent_binned = 0.0
                if doing_assembly:
                    percent_assembled = 0.0
            else:
                if doing_binning:
                    percent_binned = float(num_binned) / total * 100
                if doing_assembly:
                    percent_assembled = float(num_assembled) / total * 100

        to_write = [sample]
        if doing_binning:
            to_write.append(str(num_binned))
        if doing_assembly:
            to_write.append(str(num_assembled))
        to_write.append(str(num_not_found))
        if doing_binning:
            to_write.append("%2.1f" % percent_binned)
        if doing_assembly:
            to_write.append("%2.1f" % percent_assembled)
        output_io.write("\t".join(to_write) + "\n")

    def mean(values):
        if len(values) > 0:
            return float(sum(values)) / len(values)
        return float('nan')

    def fraction(numerator, total):
        # Mirrors print_sample(): a sample without any counts contributes
        # 0 to the average instead of raising ZeroDivisionError.
        if total == 0:
            return 0.0
        return float(numerator) / total

    if binned_otu_table_io:
        binned_table = OtuTable()
    if unbinned_otu_table_io:
        unbinned_table = OtuTable()
    if assembled_otu_table_io:
        assembled_table = OtuTable()
    if unaccounted_for_otu_table_io:
        unaccounted_for_table = OtuTable()

    binned = []
    assembled = []
    assembled_not_binned = []
    not_founds = []
    for appraisal_result in appraisal.appraisal_results:
        if doing_assembly:
            num_assembled_not_binned = \
                appraisal_result.num_assembled_not_binned()
        else:
            num_assembled_not_binned = None
        print_sample(
            appraisal_result.num_binned if doing_binning else None,
            appraisal_result.num_assembled if doing_assembly else None,
            num_assembled_not_binned,
            appraisal_result.num_not_found,
            appraisal_result.metagenome_sample_name)

        if doing_binning:
            binned.append(appraisal_result.num_binned)
        if doing_assembly:
            assembled.append(appraisal_result.num_assembled)
            assembled_not_binned.append(num_assembled_not_binned)
        not_founds.append(appraisal_result.num_not_found)

        if binned_otu_table_io:
            binned_table.add(appraisal_result.binned_otus)
        if unbinned_otu_table_io:
            unbinned_table.add(appraisal_result.assembled_not_binned_otus())
        if assembled_otu_table_io:
            assembled_table.add(appraisal_result.assembled_otus)
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.add(appraisal_result.not_found_otus)

    print_sample(sum(binned) if doing_binning else None,
                 sum(assembled) if doing_assembly else None,
                 sum(assembled_not_binned) if doing_assembly else None,
                 sum(not_founds),
                 'total')

    # Per-sample fractions, averaged below for the 'average' line.
    binned_means = []
    assembled_means = []
    for i, num_not_found in enumerate(not_founds):
        total = num_not_found
        if doing_assembly:
            total += assembled_not_binned[i]
        if doing_binning:
            total += binned[i]
            binned_means.append(fraction(binned[i], total))
        if doing_assembly:
            assembled_means.append(fraction(assembled[i], total))

    print_sample(
        "%2.1f" % mean(binned) if doing_binning else None,
        "%2.1f" % mean(assembled) if doing_assembly else None,
        None,
        "%2.1f" % mean(not_founds),
        'average',
        mypercent_binned=mean(binned_means) * 100 if doing_binning else None,
        mypercent_assembled=(
            mean(assembled_means) * 100 if doing_assembly else None))

    if binned_otu_table_io:
        binned_table.write_to(binned_otu_table_io)
    if unbinned_otu_table_io:
        unbinned_table.write_to(unbinned_otu_table_io)
    if assembled_otu_table_io:
        assembled_table.write_to(assembled_otu_table_io)
    if unaccounted_for_otu_table_io:
        unaccounted_for_table.write_to(unaccounted_for_otu_table_io)