Ejemplo n.º 1
0
	def getConsensusTaxForBlobs(self, taxrule, blast_order):
		'''
		Derive a consensus taxonomy for every blob and store it under the 'tax' key.

		For each blob the per-library hits in blob.tax[blast_lib] (tax -> score)
		are merged according to taxrule. The tax selected by keyWithMaxVal
		(defined elsewhere; by its name the key with the highest value) is then
		stored, together with its merged score, as the only entry of
		blob.tax['tax'].

		- taxrule 'A': hits of all libraries in blast_order are merged; if a tax
		  occurs in more than one library, its highest score is kept.
		- taxrule 'B': every hit of the first library in blast_order is taken.
		  Hits of later libraries are only accepted while 'no-hit' is the sole
		  entry collected so far, i.e. later libraries act as a fallback.

		Finally 'tax' is registered in self.blast_libs. It is added only once,
		so calling this method repeatedly does not duplicate the entry.

		taxrule     - 'A' or 'B'
		blast_order - list of BLAST library names, in the order they were specified

		Raises ValueError if taxrule is neither 'A' nor 'B'.
		'''
		# Without this check an unknown rule would leave the merge dict empty
		# and fail further down with an unrelated lookup error.
		if taxrule not in ('A', 'B'):
			raise ValueError("taxrule must be 'A' or 'B', got %r" % (taxrule,))

		for contig_name in self.contigs:
			blob = self.contigs[contig_name]
			merged_scores = {}
			for blast_lib in blast_order:
				# hits of this library, ordered by decreasing score
				ranked_hits = sorted(blob.tax[blast_lib].items(), key=lambda x: x[1], reverse=True)
				for tax, score in ranked_hits:
					if taxrule == 'A':
						# keep the best score seen for this tax across libraries
						if tax not in merged_scores or score > merged_scores[tax]:
							merged_scores[tax] = score
					elif blast_lib == blast_order[0]:
						# rule B, first library: take every hit as it is
						merged_scores[tax] = score
					elif len(merged_scores) <= 1 and 'no-hit' in merged_scores:
						# rule B, later library: only used while nothing but
						# 'no-hit' has been collected so far
						merged_scores[tax] = score

			best_tax = keyWithMaxVal(merged_scores)
			blob.tax['tax'] = {best_tax: merged_scores[best_tax]}

		# Register the consensus as a pseudo BLAST library exactly once.
		if 'tax' not in self.blast_libs:
			self.blast_libs.append('tax')
Ejemplo n.º 2
0
    def getConsensusTaxForBlobs(self, taxrule, blast_order):
        '''
        Derive a consensus taxonomy for every blob and store it under the 'tax' key.

        For each blob the per-library hits in blob.tax[blast_lib] (tax -> score)
        are merged according to taxrule. The tax selected by keyWithMaxVal
        (defined elsewhere; by its name the key with the highest value) is then
        stored, together with its merged score, as the only entry of
        blob.tax['tax'].

        - taxrule 'A': hits of all libraries in blast_order are merged; if a tax
          occurs in more than one library, its highest score is kept.
        - taxrule 'B': every hit of the first library in blast_order is taken.
          Hits of later libraries are only accepted while 'no-hit' is the sole
          entry collected so far, i.e. later libraries act as a fallback.

        Finally 'tax' is registered in self.blast_libs. It is added only once,
        so calling this method repeatedly does not duplicate the entry.

        taxrule     - 'A' or 'B'
        blast_order - list of BLAST library names, in the order they were specified

        Raises ValueError if taxrule is neither 'A' nor 'B'.
        '''
        # Without this check an unknown rule would leave the merge dict empty
        # and fail further down with an unrelated lookup error.
        if taxrule not in ('A', 'B'):
            raise ValueError("taxrule must be 'A' or 'B', got %r" % (taxrule,))

        for contig_name in self.contigs:
            blob = self.contigs[contig_name]
            merged_scores = {}
            for blast_lib in blast_order:
                # hits of this library, ordered by decreasing score
                ranked_hits = sorted(
                    blob.tax[blast_lib].items(),
                    key=lambda x: x[1],
                    reverse=True)
                for tax, score in ranked_hits:
                    if taxrule == 'A':
                        # keep the best score seen for this tax across libraries
                        if tax not in merged_scores or score > merged_scores[tax]:
                            merged_scores[tax] = score
                    elif blast_lib == blast_order[0]:
                        # rule B, first library: take every hit as it is
                        merged_scores[tax] = score
                    elif len(merged_scores) <= 1 and 'no-hit' in merged_scores:
                        # rule B, later library: only used while nothing but
                        # 'no-hit' has been collected so far
                        merged_scores[tax] = score

            best_tax = keyWithMaxVal(merged_scores)
            blob.tax['tax'] = {best_tax: merged_scores[best_tax]}

        # Register the consensus as a pseudo BLAST library exactly once.
        if 'tax' not in self.blast_libs:
            self.blast_libs.append('tax')
Ejemplo n.º 3
0
    def getStats(self):
        '''
        Rebuild self.stats with per-taxon and assembly-wide summary statistics.

        For every blob and every library in self.blast_libs the blob is assigned
        to the tax returned by keyWithMaxVal(blob.tax[blast_lib]). The following
        entries of self.stats are (re)created:

        - 'count'[blast_lib][tax]        number of blobs assigned to tax
        - 'span'[blast_lib][tax]         sum of blob.length of those blobs
        - 'lengths'[blast_lib][tax]      list of blob.length of those blobs
        - 'n50'[blast_lib][tax]          n50() of that list
        - 'gc'[blast_lib][tax]           summary of blob.gc
        - 'cov'[blast_lib][cov_lib][tax] summary of the blob.covs values
        - 'total_count', 'total_span', 'total_lengths', 'total_n50',
          'total_gc', 'total_cov'[cov_lib]: the same figures over all blobs
        - 'cov_libs' is reset to an empty list and not filled here

        A summary is a dict with 'raw' (list of values) and 'mean' / 'stdev',
        which end up as strings formatted with two decimals.

        Totals are computed once and do not depend on self.blast_libs, so the
        method also works when no BLAST library is registered. 'total_n50'
        stays 0 when there are no blobs.
        '''
        def new_summary():
            # Fresh accumulator; 'mean'/'stdev' are placeholders until finalised.
            return {'raw': [], 'mean': 0.0, 'stdev': 0.0}

        def finalise(summary):
            # Replace the placeholders with two-decimal strings of the raw values.
            summary['mean'] = "{0:.2f}".format(numpy.mean(summary['raw']))
            summary['stdev'] = "{0:.2f}".format(numpy.std(summary['raw']))

        stats = self.stats
        stats['count'] = {}
        stats['span'] = {}
        stats['n50'] = {}
        stats['lengths'] = {}
        stats['gc'] = {}
        stats['cov'] = {}
        stats['total_count'] = 0
        stats['total_span'] = 0
        stats['total_n50'] = 0
        stats['total_lengths'] = []
        stats['total_cov'] = {}
        stats['total_gc'] = new_summary()
        stats['cov_libs'] = []

        # Create the per-library containers up front so the second pass below
        # finds them even when there are no blobs at all.
        for blast_lib in self.blast_libs:
            for key in ('count', 'span', 'n50', 'lengths', 'gc', 'cov'):
                stats[key][blast_lib] = {}

        # First pass: collect the raw values.
        for contig_name in self.contigs:
            blob = self.contigs[contig_name]
            stats['total_count'] += 1
            stats['total_span'] += blob.length
            stats['total_lengths'].append(blob.length)
            stats['total_gc']['raw'].append(blob.gc)

            for blast_lib in self.blast_libs:
                bestTax = keyWithMaxVal(blob.tax[blast_lib])

                counts = stats['count'][blast_lib]
                spans = stats['span'][blast_lib]
                counts[bestTax] = counts.get(bestTax, 0) + 1
                spans[bestTax] = spans.get(bestTax, 0) + blob.length

                if bestTax not in stats['gc'][blast_lib]:
                    stats['gc'][blast_lib][bestTax] = new_summary()
                    stats['lengths'][blast_lib][bestTax] = []
                stats['gc'][blast_lib][bestTax]['raw'].append(blob.gc)
                stats['lengths'][blast_lib][bestTax].append(blob.length)

                for cov_lib, cov in blob.covs.items():
                    tax_covs = stats['cov'][blast_lib].setdefault(cov_lib, {})
                    if bestTax not in tax_covs:
                        tax_covs[bestTax] = new_summary()
                    tax_covs[bestTax]['raw'].append(cov)

            # Assembly-wide coverage is tracked independently of the BLAST
            # libraries, so it is initialised here rather than in the loop above.
            for cov_lib, cov in blob.covs.items():
                if cov_lib not in stats['total_cov']:
                    stats['total_cov'][cov_lib] = new_summary()
                stats['total_cov'][cov_lib]['raw'].append(cov)

        # Second pass: derive N50 / mean / stdev per library and tax.
        for blast_lib in self.blast_libs:
            for tax, list_of_lengths in stats['lengths'][blast_lib].items():
                stats['n50'][blast_lib][tax] = n50(list_of_lengths)

            for gc_summary in stats['gc'][blast_lib].values():
                finalise(gc_summary)

            for tax_covs in stats['cov'][blast_lib].values():
                for cov_summary in tax_covs.values():
                    finalise(cov_summary)

        # Assembly-wide figures, computed once.
        if stats['total_lengths']:
            stats['total_n50'] = n50(stats['total_lengths'])
        for cov_summary in stats['total_cov'].values():
            finalise(cov_summary)
        finalise(stats['total_gc'])
Ejemplo n.º 4
0
	def getStats(self):
		'''
		Rebuild self.stats with per-taxon and assembly-wide summary statistics.

		For every blob and every library in self.blast_libs the blob is assigned
		to the tax returned by keyWithMaxVal(blob.tax[blast_lib]). The following
		entries of self.stats are (re)created:

		- 'count'[blast_lib][tax]        number of blobs assigned to tax
		- 'span'[blast_lib][tax]         sum of blob.length of those blobs
		- 'lengths'[blast_lib][tax]      list of blob.length of those blobs
		- 'n50'[blast_lib][tax]          n50() of that list
		- 'gc'[blast_lib][tax]           summary of blob.gc
		- 'cov'[blast_lib][cov_lib][tax] summary of the blob.covs values
		- 'total_count', 'total_span', 'total_lengths', 'total_n50',
		  'total_gc', 'total_cov'[cov_lib]: the same figures over all blobs
		- 'cov_libs' is reset to an empty list and not filled here

		A summary is a dict with 'raw' (list of values) and 'mean' / 'stdev',
		which end up as strings formatted with two decimals.

		Totals are computed once and do not depend on self.blast_libs, so the
		method also works when no BLAST library is registered. 'total_n50'
		stays 0 when there are no blobs.
		'''
		def new_summary():
			# Fresh accumulator; 'mean'/'stdev' are placeholders until finalised.
			return {'raw' : [], 'mean' : 0.0, 'stdev' : 0.0}

		def finalise(summary):
			# Replace the placeholders with two-decimal strings of the raw values.
			summary['mean'] = "{0:.2f}".format(numpy.mean(summary['raw']))
			summary['stdev'] = "{0:.2f}".format(numpy.std(summary['raw']))

		stats = self.stats
		stats['count'] = {}
		stats['span'] = {}
		stats['n50'] = {}
		stats['lengths'] = {}
		stats['gc'] = {}
		stats['cov'] = {}
		stats['total_count'] = 0
		stats['total_span'] = 0
		stats['total_n50'] = 0
		stats['total_lengths'] = []
		stats['total_cov'] = {}
		stats['total_gc'] = new_summary()
		stats['cov_libs'] = []

		# Create the per-library containers up front so the second pass below
		# finds them even when there are no blobs at all.
		for blast_lib in self.blast_libs:
			for key in ('count', 'span', 'n50', 'lengths', 'gc', 'cov'):
				stats[key][blast_lib] = {}

		# First pass: collect the raw values.
		for contig_name in self.contigs:
			blob = self.contigs[contig_name]
			stats['total_count'] += 1
			stats['total_span'] += blob.length
			stats['total_lengths'].append(blob.length)
			stats['total_gc']['raw'].append(blob.gc)

			for blast_lib in self.blast_libs:
				bestTax = keyWithMaxVal(blob.tax[blast_lib])

				counts = stats['count'][blast_lib]
				spans = stats['span'][blast_lib]
				counts[bestTax] = counts.get(bestTax, 0) + 1
				spans[bestTax] = spans.get(bestTax, 0) + blob.length

				if bestTax not in stats['gc'][blast_lib]:
					stats['gc'][blast_lib][bestTax] = new_summary()
					stats['lengths'][blast_lib][bestTax] = []
				stats['gc'][blast_lib][bestTax]['raw'].append(blob.gc)
				stats['lengths'][blast_lib][bestTax].append(blob.length)

				for cov_lib, cov in blob.covs.items():
					tax_covs = stats['cov'][blast_lib].setdefault(cov_lib, {})
					if bestTax not in tax_covs:
						tax_covs[bestTax] = new_summary()
					tax_covs[bestTax]['raw'].append(cov)

			# Assembly-wide coverage is tracked independently of the BLAST
			# libraries, so it is initialised here rather than in the loop above.
			for cov_lib, cov in blob.covs.items():
				if cov_lib not in stats['total_cov']:
					stats['total_cov'][cov_lib] = new_summary()
				stats['total_cov'][cov_lib]['raw'].append(cov)

		# Second pass: derive N50 / mean / stdev per library and tax.
		for blast_lib in self.blast_libs:
			for tax, list_of_lengths in stats['lengths'][blast_lib].items():
				stats['n50'][blast_lib][tax] = n50(list_of_lengths)

			for gc_summary in stats['gc'][blast_lib].values():
				finalise(gc_summary)

			for tax_covs in stats['cov'][blast_lib].values():
				for cov_summary in tax_covs.values():
					finalise(cov_summary)

		# Assembly-wide figures, computed once.
		if stats['total_lengths']:
			stats['total_n50'] = n50(stats['total_lengths'])
		for cov_summary in stats['total_cov'].values():
			finalise(cov_summary)
		finalise(stats['total_gc'])