def getConsensusTaxForBlobs(self, taxrule, blast_order):
    '''
    Compute a consensus taxonomy for every blob and store it under tax['tax'].

    For each contig the hits of the BLAST libraries (taxon -> score) are
    merged into one dict of candidates, depending on taxrule:

    - taxrule == 'A': the hits of all libraries in blast_order are pooled;
      if a taxon occurs in more than one library, its highest score is kept.
    - taxrule == 'B': the hits of the first library in blast_order are taken
      as they are. A later library only contributes while the candidates
      consist of nothing but 'no-hit'; its hits are then added in order of
      decreasing score until a taxon other than 'no-hit' has been added.

    Any other taxrule leaves the candidates empty; they are handed to
    keyWithMaxVal() unchanged.

    The taxon selected by keyWithMaxVal() (defined elsewhere; expected to
    return the key with the highest value) becomes, together with its score,
    the only entry of self.contigs[name].tax['tax']. Finally 'tax' is
    registered in self.blast_libs, once, no matter how often this method
    is called.

    taxrule     : 'A' or 'B'
    blast_order : list of BLAST library names, in the order in which the
                  BLAST files were specified
    '''
    for contig_name in self.contigs:
        blob = self.contigs[contig_name]
        candidates = {}
        for blast_lib in blast_order:
            # Walk the hits of this library from highest to lowest score;
            # rule B relies on this order when it falls back to a later library.
            ranked_hits = sorted(blob.tax[blast_lib].items(),
                                 key=lambda hit: hit[1],
                                 reverse=True)
            for tax, score in ranked_hits:
                if taxrule == 'A':
                    if tax not in candidates or score > candidates[tax]:
                        candidates[tax] = score
                elif taxrule == 'B':
                    if blast_lib == blast_order[0]:
                        # First blast_lib: always accepted
                        candidates[tax] = score
                    elif len(candidates) <= 1 and 'no-hit' in candidates:
                        # Only 'no-hit' so far: fall back to this library
                        candidates[tax] = score
        best_tax = keyWithMaxVal(candidates)
        blob.tax['tax'] = {best_tax: candidates[best_tax]}
    # Register the consensus only once; a duplicate entry would make every
    # loop over self.blast_libs process 'tax' twice.
    if 'tax' not in self.blast_libs:
        self.blast_libs.append('tax')
def getConsensusTaxForBlobs(self, taxrule, blast_order):
    '''
    Compute a consensus taxonomy for every blob and store it under tax['tax'].

    For each contig the hits of the BLAST libraries (taxon -> score) are
    merged into one dict of candidates, depending on taxrule:

    - taxrule == 'A': the hits of all libraries in blast_order are pooled;
      if a taxon occurs in more than one library, its highest score is kept.
    - taxrule == 'B': the hits of the first library in blast_order are taken
      as they are. A later library only contributes while the candidates
      consist of nothing but 'no-hit'; its hits are then added in order of
      decreasing score until a taxon other than 'no-hit' has been added.

    Any other taxrule leaves the candidates empty; they are handed to
    keyWithMaxVal() unchanged.

    The taxon selected by keyWithMaxVal() (defined elsewhere; expected to
    return the key with the highest value) becomes, together with its score,
    the only entry of self.contigs[name].tax['tax']. Finally 'tax' is
    registered in self.blast_libs, once, no matter how often this method
    is called.

    taxrule     : 'A' or 'B'
    blast_order : list of BLAST library names, in the order in which the
                  BLAST files were specified
    '''
    for contig_name in self.contigs:
        blob = self.contigs[contig_name]
        candidates = {}
        for blast_lib in blast_order:
            # Walk the hits of this library from highest to lowest score;
            # rule B relies on this order when it falls back to a later library.
            ranked_hits = sorted(blob.tax[blast_lib].items(),
                                 key=lambda hit: hit[1],
                                 reverse=True)
            for tax, score in ranked_hits:
                if taxrule == 'A':
                    if tax not in candidates or score > candidates[tax]:
                        candidates[tax] = score
                elif taxrule == 'B':
                    if blast_lib == blast_order[0]:
                        # First blast_lib: always accepted
                        candidates[tax] = score
                    elif len(candidates) <= 1 and 'no-hit' in candidates:
                        # Only 'no-hit' so far: fall back to this library
                        candidates[tax] = score
        best_tax = keyWithMaxVal(candidates)
        blob.tax['tax'] = {best_tax: candidates[best_tax]}
    # Register the consensus only once; a duplicate entry would make every
    # loop over self.blast_libs process 'tax' twice.
    if 'tax' not in self.blast_libs:
        self.blast_libs.append('tax')
def getStats(self):
    '''
    Collect summary statistics per taxonomic group for every entry of
    self.blast_libs and for all blobs together.

    Results are written to self.stats (all keys below are reset first):

    - 'count', 'span', 'lengths', 'n50', 'gc' : stats[key][blast_lib][tax]
    - 'cov'                                   : stats['cov'][blast_lib][cov_lib][tax]
    - 'total_count', 'total_span', 'total_lengths', 'total_n50',
      'total_gc', 'total_cov'[cov_lib]        : the same figures over all blobs
    - 'cov_libs'                              : reset to an empty list, not
                                                filled in by this method

    Per BLAST library each blob is assigned to the taxon returned by
    keyWithMaxVal(blob.tax[blast_lib]). GC and coverage entries are dicts
    holding the 'raw' values plus 'mean' and 'stdev'; the latter two start
    as 0.0 and are replaced by strings formatted to two decimals
    (numpy.mean / numpy.std with default arguments) once computed.

    If there are no contigs, the totals keep their zero defaults and each
    BLAST library gets empty per-taxon dicts.
    '''
    def _new_summary():
        # Fresh accumulator for a list of values and its summary figures.
        return {'raw': [], 'mean': 0.0, 'stdev': 0.0}

    def _summarise(summary):
        # Fill in mean/stdev of the collected raw values as 2-decimal strings.
        summary['mean'] = "{0:.2f}".format(numpy.mean(summary['raw']))
        summary['stdev'] = "{0:.2f}".format(numpy.std(summary['raw']))

    stats = self.stats
    stats['count'] = {}
    stats['span'] = {}
    stats['n50'] = {}
    stats['lengths'] = {}
    stats['gc'] = {}
    stats['cov'] = {}
    stats['total_count'] = 0
    stats['total_span'] = 0
    stats['total_n50'] = 0
    stats['total_lengths'] = []
    stats['total_cov'] = {}
    stats['total_gc'] = _new_summary()
    stats['cov_libs'] = []

    # Create the per-library containers up front so that the summary loop
    # below also works when there is no contig to create them lazily.
    for blast_lib in self.blast_libs:
        for key in ('count', 'span', 'lengths', 'gc', 'cov', 'n50'):
            if blast_lib not in stats[key]:
                stats[key][blast_lib] = {}

    for contig_name in self.contigs:
        blob = self.contigs[contig_name]
        stats['total_count'] += 1
        stats['total_span'] += blob.length
        stats['total_lengths'].append(blob.length)
        stats['total_gc']['raw'].append(blob.gc)

        for blast_lib in self.blast_libs:
            bestTax = keyWithMaxVal(blob.tax[blast_lib])

            counts = stats['count'][blast_lib]
            counts[bestTax] = counts.get(bestTax, 0) + 1
            spans = stats['span'][blast_lib]
            spans[bestTax] = spans.get(bestTax, 0) + blob.length

            if bestTax not in stats['gc'][blast_lib]:
                stats['gc'][blast_lib][bestTax] = _new_summary()
                stats['lengths'][blast_lib][bestTax] = []
            stats['gc'][blast_lib][bestTax]['raw'].append(blob.gc)
            stats['lengths'][blast_lib][bestTax].append(blob.length)

            cov_by_lib = stats['cov'][blast_lib]
            for cov_lib, cov in blob.covs.items():
                if cov_lib not in cov_by_lib:
                    cov_by_lib[cov_lib] = {}
                if bestTax not in cov_by_lib[cov_lib]:
                    cov_by_lib[cov_lib][bestTax] = _new_summary()
                cov_by_lib[cov_lib][bestTax]['raw'].append(cov)

        # Totals per coverage library do not depend on any BLAST library,
        # so their accumulators are created here rather than in the loop above.
        for cov_lib, cov in blob.covs.items():
            if cov_lib not in stats['total_cov']:
                stats['total_cov'][cov_lib] = _new_summary()
            stats['total_cov'][cov_lib]['raw'].append(cov)

    for blast_lib in self.blast_libs:
        # calculate N50
        for tax, list_of_lengths in stats['lengths'][blast_lib].items():
            stats['n50'][blast_lib][tax] = n50(list_of_lengths)
        # calculate tax-specific gc mean/stdev
        for summary in stats['gc'][blast_lib].values():
            _summarise(summary)
        # calculate tax-specific cov mean/stdev
        for cov_by_tax in stats['cov'][blast_lib].values():
            for summary in cov_by_tax.values():
                _summarise(summary)

    # calculate total cov mean/stdev (entries only exist once a value was added)
    for summary in stats['total_cov'].values():
        _summarise(summary)

    # Whole-assembly figures; skipped for an empty set of contigs so that
    # n50() and numpy are never asked to summarise an empty list.
    if stats['total_lengths']:
        stats['total_n50'] = n50(stats['total_lengths'])
        _summarise(stats['total_gc'])
def getStats(self):
    '''
    Collect summary statistics per taxonomic group for every entry of
    self.blast_libs and for all blobs together.

    Results are written to self.stats (all keys below are reset first):

    - 'count', 'span', 'lengths', 'n50', 'gc' : stats[key][blast_lib][tax]
    - 'cov'                                   : stats['cov'][blast_lib][cov_lib][tax]
    - 'total_count', 'total_span', 'total_lengths', 'total_n50',
      'total_gc', 'total_cov'[cov_lib]        : the same figures over all blobs
    - 'cov_libs'                              : reset to an empty list, not
                                                filled in by this method

    Per BLAST library each blob is assigned to the taxon returned by
    keyWithMaxVal(blob.tax[blast_lib]). GC and coverage entries are dicts
    holding the 'raw' values plus 'mean' and 'stdev'; the latter two start
    as 0.0 and are replaced by strings formatted to two decimals
    (numpy.mean / numpy.std with default arguments) once computed.

    If there are no contigs, the totals keep their zero defaults and each
    BLAST library gets empty per-taxon dicts.
    '''
    def _new_summary():
        # Fresh accumulator for a list of values and its summary figures.
        return {'raw': [], 'mean': 0.0, 'stdev': 0.0}

    def _summarise(summary):
        # Fill in mean/stdev of the collected raw values as 2-decimal strings.
        summary['mean'] = "{0:.2f}".format(numpy.mean(summary['raw']))
        summary['stdev'] = "{0:.2f}".format(numpy.std(summary['raw']))

    stats = self.stats
    stats['count'] = {}
    stats['span'] = {}
    stats['n50'] = {}
    stats['lengths'] = {}
    stats['gc'] = {}
    stats['cov'] = {}
    stats['total_count'] = 0
    stats['total_span'] = 0
    stats['total_n50'] = 0
    stats['total_lengths'] = []
    stats['total_cov'] = {}
    stats['total_gc'] = _new_summary()
    stats['cov_libs'] = []

    # Create the per-library containers up front so that the summary loop
    # below also works when there is no contig to create them lazily.
    for blast_lib in self.blast_libs:
        for key in ('count', 'span', 'lengths', 'gc', 'cov', 'n50'):
            if blast_lib not in stats[key]:
                stats[key][blast_lib] = {}

    for contig_name in self.contigs:
        blob = self.contigs[contig_name]
        stats['total_count'] += 1
        stats['total_span'] += blob.length
        stats['total_lengths'].append(blob.length)
        stats['total_gc']['raw'].append(blob.gc)

        for blast_lib in self.blast_libs:
            bestTax = keyWithMaxVal(blob.tax[blast_lib])

            counts = stats['count'][blast_lib]
            counts[bestTax] = counts.get(bestTax, 0) + 1
            spans = stats['span'][blast_lib]
            spans[bestTax] = spans.get(bestTax, 0) + blob.length

            if bestTax not in stats['gc'][blast_lib]:
                stats['gc'][blast_lib][bestTax] = _new_summary()
                stats['lengths'][blast_lib][bestTax] = []
            stats['gc'][blast_lib][bestTax]['raw'].append(blob.gc)
            stats['lengths'][blast_lib][bestTax].append(blob.length)

            cov_by_lib = stats['cov'][blast_lib]
            for cov_lib, cov in blob.covs.items():
                if cov_lib not in cov_by_lib:
                    cov_by_lib[cov_lib] = {}
                if bestTax not in cov_by_lib[cov_lib]:
                    cov_by_lib[cov_lib][bestTax] = _new_summary()
                cov_by_lib[cov_lib][bestTax]['raw'].append(cov)

        # Totals per coverage library do not depend on any BLAST library,
        # so their accumulators are created here rather than in the loop above.
        for cov_lib, cov in blob.covs.items():
            if cov_lib not in stats['total_cov']:
                stats['total_cov'][cov_lib] = _new_summary()
            stats['total_cov'][cov_lib]['raw'].append(cov)

    for blast_lib in self.blast_libs:
        # calculate N50
        for tax, list_of_lengths in stats['lengths'][blast_lib].items():
            stats['n50'][blast_lib][tax] = n50(list_of_lengths)
        # calculate tax-specific gc mean/stdev
        for summary in stats['gc'][blast_lib].values():
            _summarise(summary)
        # calculate tax-specific cov mean/stdev
        for cov_by_tax in stats['cov'][blast_lib].values():
            for summary in cov_by_tax.values():
                _summarise(summary)

    # calculate total cov mean/stdev (entries only exist once a value was added)
    for summary in stats['total_cov'].values():
        _summarise(summary)

    # Whole-assembly figures; skipped for an empty set of contigs so that
    # n50() and numpy are never asked to summarise an empty list.
    if stats['total_lengths']:
        stats['total_n50'] = n50(stats['total_lengths'])
        _summarise(stats['total_gc'])