Ejemplo n.º 1
0
    def getStats(self):
        '''
        Calculate summary statistics per taxonomic group and for the whole
        assembly, and store them in ``self.stats``.

        For every entry of ``self.blast_libs`` each contig is assigned to the
        taxon returned by ``keyWithMaxVal(blob.tax[blast_lib])``. Contig
        count, span, lengths, GC values and coverage values (one series per
        key of ``blob.covs``) are collected per taxon, and assembly-wide
        totals are collected alongside. N50 values come from ``n50()``;
        means and standard deviations come from numpy and are stored as
        strings formatted with two decimals.

        The containers for every BLAST library are created up front and the
        assembly-wide figures are computed outside the per-library loop, so
        an empty ``self.contigs`` or an empty ``self.blast_libs`` does not
        raise ``KeyError``. Entries without raw values keep their initial
        ``0.0`` mean/stdev, and ``total_n50`` stays ``0`` when there are no
        contigs. ``self.stats['cov_libs']`` is only reset to an empty list.
        '''
        stats = self.stats
        stats['count'] = {}
        stats['span'] = {}
        stats['n50'] = {}
        stats['lengths'] = {}
        stats['gc'] = {}
        stats['cov'] = {}
        stats['total_count'] = 0
        stats['total_span'] = 0
        stats['total_n50'] = 0
        stats['total_lengths'] = []
        stats['total_cov'] = {}
        stats['total_gc'] = {'raw': [], 'mean': 0.0, 'stdev': 0.0}
        stats['cov_libs'] = []

        def new_summary():
            # Fresh container: raw values plus their (later filled) summary.
            return {'raw': [], 'mean': 0.0, 'stdev': 0.0}

        def summarise(entry):
            # Fill mean/stdev from the raw values; nothing to do (and no
            # numpy "empty slice" warning) when no values were collected.
            if not entry['raw']:
                return
            entry['mean'] = "{0:.2f}".format(numpy.mean(entry['raw']))
            entry['stdev'] = "{0:.2f}".format(numpy.std(entry['raw']))

        # Create the per-library containers before looking at any contig so
        # that the summary stage below also works without contigs.
        for blast_lib in self.blast_libs:
            for key in ('count', 'span', 'n50', 'lengths', 'gc', 'cov'):
                stats[key][blast_lib] = {}

        for contig_name in self.contigs:
            blob = self.contigs[contig_name]
            stats['total_count'] += 1
            stats['total_span'] += blob.length
            stats['total_lengths'].append(blob.length)
            stats['total_gc']['raw'].append(blob.gc)

            for blast_lib in self.blast_libs:
                bestTax = keyWithMaxVal(blob.tax[blast_lib])

                counts = stats['count'][blast_lib]
                counts[bestTax] = counts.get(bestTax, 0) + 1
                spans = stats['span'][blast_lib]
                spans[bestTax] = spans.get(bestTax, 0) + blob.length

                if bestTax not in stats['gc'][blast_lib]:
                    stats['gc'][blast_lib][bestTax] = new_summary()
                    stats['lengths'][blast_lib][bestTax] = []
                stats['gc'][blast_lib][bestTax]['raw'].append(blob.gc)
                stats['lengths'][blast_lib][bestTax].append(blob.length)

                for cov_lib, cov in blob.covs.items():
                    tax_covs = stats['cov'][blast_lib].setdefault(cov_lib, {})
                    if bestTax not in tax_covs:
                        tax_covs[bestTax] = new_summary()
                    tax_covs[bestTax]['raw'].append(cov)

            # Assembly-wide coverage is independent of the BLAST libraries,
            # so its containers are created here rather than in the loop
            # above.
            for cov_lib, cov in blob.covs.items():
                if cov_lib not in stats['total_cov']:
                    stats['total_cov'][cov_lib] = new_summary()
                stats['total_cov'][cov_lib]['raw'].append(cov)

        for blast_lib in self.blast_libs:
            # N50 per taxon
            for tax, list_of_lengths in stats['lengths'][blast_lib].items():
                stats['n50'][blast_lib][tax] = n50(list_of_lengths)

            # GC mean/stdev per taxon
            for tax in stats['gc'][blast_lib]:
                summarise(stats['gc'][blast_lib][tax])

            # coverage mean/stdev per coverage library and taxon
            for cov_lib in stats['cov'][blast_lib]:
                for tax in stats['cov'][blast_lib][cov_lib]:
                    summarise(stats['cov'][blast_lib][cov_lib][tax])

        # Assembly-wide figures: computed once, independent of blast_libs.
        if stats['total_lengths']:
            stats['total_n50'] = n50(stats['total_lengths'])
        for cov_lib in stats['total_cov']:
            summarise(stats['total_cov'][cov_lib])
        summarise(stats['total_gc'])
Ejemplo n.º 2
0
def plot(data, cov_data, outfile, title):
	"""Draw the GC/coverage scatter plot with its two histograms and save it.

	One scatter series and one pair of step histograms are drawn for every
	entry of the module-level ``tax_list`` that has at least one row in
	*data*. The figure is saved as ``outfile + suffix + "." + fig_format``,
	where the suffix is ``.hist_span`` or ``.hist_count`` depending on the
	module-level ``hist_span`` flag. When ``multi_plot`` is set, an extra
	numbered figure is saved after each plotted taxon.

	Args:
		data: 2-D array; column 1 is read as contig length (cast to int),
			column 2 as GC (cast to float) and column 3 as taxon name
			(cast to str).
		cov_data: array indexed with the same row mask as *data* and cast
			to float; used as the y values of the scatter plot.
		outfile: output path prefix.
		title: figure title; no title is drawn when it is falsy.

	Besides its arguments the function reads names defined outside this
	block: ``tax_list``, ``hist_span``, ``ignore_contig_len``,
	``multi_plot``, ``max_cov``, ``color_dict``, ``fig_format``,
	``fontsize``, the colour constants and ``nullfmt``.

	Span values are divided by float literals so that the megabase figure
	in the legend and the kb histogram weights are not truncated by
	Python 2 integer division. ``print`` is called with one parenthesised
	argument, which behaves the same under Python 2 and Python 3.
	"""
	rect_scatter, rect_histx, rect_histy, rect_legend = set_canvas()

	# Set up the figure and its axes.
	# NOTE(review): `axisbg` was superseded by `facecolor` in newer
	# matplotlib releases - confirm the targeted version before upgrading.
	plt.figure(1, figsize=(35, 35), dpi=400)

	axScatter = plt.axes(rect_scatter, axisbg=background_grey, yscale='log')
	axScatter = set_format_scatterplot(axScatter)
	axHistx = plt.axes(rect_histx, axisbg=background_grey)
	axHistx = set_format_hist_x(axHistx, axScatter)
	axHisty = plt.axes(rect_histy, axisbg=background_grey)
	axHisty = set_format_hist_y(axHisty, axScatter)
	# Hide the labels of the first two major y ticks of the scatter plot.
	axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False)
	axScatter.yaxis.get_major_ticks()[1].label1.set_visible(False)
	if title:
		plt.suptitle(title, fontsize=35, verticalalignment='top')

	# The legend gets its own axes without ticks or tick labels.
	axLegend = plt.axes(rect_legend, axisbg=white)
	axLegend.xaxis.set_major_locator(plt.NullLocator())
	axLegend.xaxis.set_major_formatter(nullfmt)
	axLegend.yaxis.set_major_locator(plt.NullLocator())
	axLegend.yaxis.set_major_formatter(nullfmt)

	# Bins for the GC (top) and coverage (right) histograms.
	# NOTE(review): math.log is the natural logarithm while the result is
	# used as a base-10 exponent; math.log10 may have been intended.
	# Left unchanged because it alters the bin edges - confirm first.
	top_bins = np.arange(0, 1, 0.01)
	right_bins = np.logspace(-2, (int(math.log(max_cov)) + 1), 200, base=10.0)

	# Handles and labels for the combined legend.
	legend_handles = []
	legend_labels = []

	# The histogram type is part of the file name and of the axis labels.
	if hist_span:
		outfile += ".hist_span"
		hist_axis_label = "Span (kb)"
	else:
		outfile += ".hist_count"
		hist_axis_label = "Count"

	print("[STATUS] Plotting : " + outfile)

	# `i` counts every taxon (also skipped ones) so that the multi-plot
	# files sort in taxon order when sorted by name.
	for i, tax in enumerate(tax_list, 1):

		# Boolean mask of the rows of `data` that belong to this taxon.
		index_for_tax = np.where(data[:,3].astype(str) == tax, True, False)
		number_of_contigs_for_tax = np.sum(index_for_tax)
		if number_of_contigs_for_tax == 0:
			continue

		# Length, GC and coverage of all contigs of this taxon.
		len_array = data[index_for_tax][:,1].astype(int)
		gc_array = data[index_for_tax][:,2].astype(float)
		cov_array = cov_data[index_for_tax].astype(float)

		# Float divisor: keep the fractional megabases shown in the label.
		span_of_contigs_for_tax_in_mb = np.sum(len_array) / 1000000.0

		label = tax + " (" + "{:,}".format(number_of_contigs_for_tax) + "; " + "%.2f" % round(span_of_contigs_for_tax_in_mb, 2) + "MB; " + "{:,}".format(n50(len_array)) + "nt)"
		print("\t" + label)

		# Marker settings: 'no-hit' is always small, grey and translucent.
		if tax == 'no-hit':
			s, lw, alpha, color = 15, 0.5, 0.5, grey
		elif ignore_contig_len:
			s, lw, alpha, color = 65, 0.5, 1, color_dict[tax]
		else:
			s, lw, alpha, color = 15, 0.5, 1, color_dict[tax]

		if ignore_contig_len:
			# constant marker size
			s_array = [s] * len(len_array)
		else:
			# marker size scales with contig length
			s_array = [contig_length/s for contig_length in len_array]

		axHistx.set_ylabel(hist_axis_label)
		axHisty.set_xlabel(hist_axis_label, rotation='horizontal')

		# Rotate the tick labels of the coverage histogram.
		for xtick in axHisty.get_xticklabels():
			xtick.set_rotation(270)

		legend_handles.append(Line2D([0], [0], linewidth=0.5, linestyle="none", marker="o", alpha=1, markersize=24, markerfacecolor=color))
		legend_labels.append(label)

		# Span histograms weight every contig by its length in kb (float
		# divisor, so short contigs are not rounded down to zero).
		hist_kwargs = {}
		if hist_span:
			hist_kwargs['weights'] = len_array / 1000.0
		axHistx.hist(gc_array, color=color, bins=top_bins, histtype='step', lw=3, **hist_kwargs)
		axHisty.hist(cov_array, color=color, bins=right_bins, histtype='step', orientation='horizontal', lw=3, **hist_kwargs)

		axScatter.scatter(gc_array, cov_array, color=color, s=s_array, lw=lw, alpha=alpha, edgecolor=black, label=label)

		axLegend.axis('off')

		if multi_plot:
			# Save an intermediate figure after each plotted taxon.
			axLegend.legend(legend_handles, legend_labels, loc=6, numpoints=1, fontsize=fontsize, frameon=True)
			plot_ref_legend(axScatter)
			plt.savefig(outfile + "." + str(i) + "_" + tax.replace("/", "") + "." + fig_format, format=fig_format)

	# The scale legend is only drawn when marker size encodes contig length.
	if not ignore_contig_len:
		plot_ref_legend(axScatter)

	axLegend.legend(legend_handles, legend_labels, numpoints=1, fontsize=fontsize, frameon=True, loc=6)
	sys.stdout.write("Saving file " + outfile)
	plt.savefig(outfile + "." + fig_format, format=fig_format)
	plt.close()
	print(" [Done]\n")
Ejemplo n.º 3
0
	def getStats(self):
		'''
		Calculate summary statistics per taxonomic group and for the whole
		assembly, and store them in ``self.stats``.

		For every entry of ``self.blast_libs`` each contig is assigned to the
		taxon returned by ``keyWithMaxVal(blob.tax[blast_lib])``. Contig
		count, span, lengths, GC values and coverage values (one series per
		key of ``blob.covs``) are collected per taxon, and assembly-wide
		totals are collected alongside. N50 values come from ``n50()``;
		means and standard deviations come from numpy and are stored as
		strings formatted with two decimals.

		The containers for every BLAST library are created up front and the
		assembly-wide figures are computed outside the per-library loop, so
		an empty ``self.contigs`` or an empty ``self.blast_libs`` does not
		raise ``KeyError``. Entries without raw values keep their initial
		``0.0`` mean/stdev, and ``total_n50`` stays ``0`` when there are no
		contigs. ``self.stats['cov_libs']`` is only reset to an empty list.
		'''
		stats = self.stats
		stats['count'] = {}
		stats['span'] = {}
		stats['n50'] = {}
		stats['lengths'] = {}
		stats['gc'] = {}
		stats['cov'] = {}
		stats['total_count'] = 0
		stats['total_span'] = 0
		stats['total_n50'] = 0
		stats['total_lengths'] = []
		stats['total_cov'] = {}
		stats['total_gc'] = {'raw' : [], 'mean' : 0.0, 'stdev' : 0.0}
		stats['cov_libs'] = []

		def new_summary():
			# Fresh container: raw values plus their (later filled) summary.
			return {'raw' : [], 'mean' : 0.0, 'stdev' : 0.0}

		def summarise(entry):
			# Fill mean/stdev from the raw values; nothing to do (and no
			# numpy "empty slice" warning) when no values were collected.
			if not entry['raw']:
				return
			entry['mean'] = "{0:.2f}".format(numpy.mean(entry['raw']))
			entry['stdev'] = "{0:.2f}".format(numpy.std(entry['raw']))

		# Create the per-library containers before looking at any contig so
		# that the summary stage below also works without contigs.
		for blast_lib in self.blast_libs:
			for key in ('count', 'span', 'n50', 'lengths', 'gc', 'cov'):
				stats[key][blast_lib] = {}

		for contig_name in self.contigs:
			blob = self.contigs[contig_name]
			stats['total_count'] += 1
			stats['total_span'] += blob.length
			stats['total_lengths'].append(blob.length)
			stats['total_gc']['raw'].append(blob.gc)

			for blast_lib in self.blast_libs:
				bestTax = keyWithMaxVal(blob.tax[blast_lib])

				counts = stats['count'][blast_lib]
				counts[bestTax] = counts.get(bestTax, 0) + 1
				spans = stats['span'][blast_lib]
				spans[bestTax] = spans.get(bestTax, 0) + blob.length

				if bestTax not in stats['gc'][blast_lib]:
					stats['gc'][blast_lib][bestTax] = new_summary()
					stats['lengths'][blast_lib][bestTax] = []
				stats['gc'][blast_lib][bestTax]['raw'].append(blob.gc)
				stats['lengths'][blast_lib][bestTax].append(blob.length)

				for cov_lib, cov in blob.covs.items():
					tax_covs = stats['cov'][blast_lib].setdefault(cov_lib, {})
					if bestTax not in tax_covs:
						tax_covs[bestTax] = new_summary()
					tax_covs[bestTax]['raw'].append(cov)

			# Assembly-wide coverage is independent of the BLAST libraries,
			# so its containers are created here rather than in the loop
			# above.
			for cov_lib, cov in blob.covs.items():
				if cov_lib not in stats['total_cov']:
					stats['total_cov'][cov_lib] = new_summary()
				stats['total_cov'][cov_lib]['raw'].append(cov)

		for blast_lib in self.blast_libs:
			# N50 per taxon
			for tax, list_of_lengths in stats['lengths'][blast_lib].items():
				stats['n50'][blast_lib][tax] = n50(list_of_lengths)

			# GC mean/stdev per taxon
			for tax in stats['gc'][blast_lib]:
				summarise(stats['gc'][blast_lib][tax])

			# coverage mean/stdev per coverage library and taxon
			for cov_lib in stats['cov'][blast_lib]:
				for tax in stats['cov'][blast_lib][cov_lib]:
					summarise(stats['cov'][blast_lib][cov_lib][tax])

		# Assembly-wide figures: computed once, independent of blast_libs.
		if stats['total_lengths']:
			stats['total_n50'] = n50(stats['total_lengths'])
		for cov_lib in stats['total_cov']:
			summarise(stats['total_cov'][cov_lib])
		summarise(stats['total_gc'])
Ejemplo n.º 4
0
def plot(data, cov_data, outfile, title):
    """Draw the GC/coverage scatter plot with its two histograms and save it.

    One scatter series and one pair of step histograms are drawn for every
    entry of the module-level ``tax_list`` that has at least one row in
    *data*. The figure is saved as ``outfile + suffix + "." + fig_format``,
    where the suffix is ``.hist_span`` or ``.hist_count`` depending on the
    module-level ``hist_span`` flag. When ``multi_plot`` is set, an extra
    numbered figure is saved after each plotted taxon.

    Args:
        data: 2-D array; column 1 is read as contig length (cast to int),
            column 2 as GC (cast to float) and column 3 as taxon name
            (cast to str).
        cov_data: array indexed with the same row mask as *data* and cast
            to float; used as the y values of the scatter plot.
        outfile: output path prefix.
        title: figure title; no title is drawn when it is falsy.

    Besides its arguments the function reads names defined outside this
    block: ``tax_list``, ``hist_span``, ``ignore_contig_len``,
    ``multi_plot``, ``max_cov``, ``color_dict``, ``fig_format``,
    ``fontsize``, the colour constants and ``nullfmt``.

    Span values are divided by float literals so that the megabase figure
    in the legend and the kb histogram weights are not truncated by
    Python 2 integer division. ``print`` is called with one parenthesised
    argument, which behaves the same under Python 2 and Python 3.
    """
    rect_scatter, rect_histx, rect_histy, rect_legend = set_canvas()

    # Set up the figure and its axes.
    # NOTE(review): `axisbg` was superseded by `facecolor` in newer
    # matplotlib releases - confirm the targeted version before upgrading.
    plt.figure(1, figsize=(35, 35), dpi=400)

    axScatter = plt.axes(rect_scatter, axisbg=background_grey, yscale='log')
    axScatter = set_format_scatterplot(axScatter)
    axHistx = plt.axes(rect_histx, axisbg=background_grey)
    axHistx = set_format_hist_x(axHistx, axScatter)
    axHisty = plt.axes(rect_histy, axisbg=background_grey)
    axHisty = set_format_hist_y(axHisty, axScatter)
    # Hide the labels of the first two major y ticks of the scatter plot.
    axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False)
    axScatter.yaxis.get_major_ticks()[1].label1.set_visible(False)
    if title:
        plt.suptitle(title, fontsize=35, verticalalignment='top')

    # The legend gets its own axes without ticks or tick labels.
    axLegend = plt.axes(rect_legend, axisbg=white)
    axLegend.xaxis.set_major_locator(plt.NullLocator())
    axLegend.xaxis.set_major_formatter(nullfmt)
    axLegend.yaxis.set_major_locator(plt.NullLocator())
    axLegend.yaxis.set_major_formatter(nullfmt)

    # Bins for the GC (top) and coverage (right) histograms.
    # NOTE(review): math.log is the natural logarithm while the result is
    # used as a base-10 exponent; math.log10 may have been intended.
    # Left unchanged because it alters the bin edges - confirm first.
    top_bins = np.arange(0, 1, 0.01)
    right_bins = np.logspace(-2, (int(math.log(max_cov)) + 1), 200, base=10.0)

    # Handles and labels for the combined legend.
    legend_handles = []
    legend_labels = []

    # The histogram type is part of the file name and of the axis labels.
    if hist_span:
        outfile += ".hist_span"
        hist_axis_label = "Span (kb)"
    else:
        outfile += ".hist_count"
        hist_axis_label = "Count"

    print("[STATUS] Plotting : " + outfile)

    # `i` counts every taxon (also skipped ones) so that the multi-plot
    # files sort in taxon order when sorted by name.
    for i, tax in enumerate(tax_list, 1):

        # Boolean mask of the rows of `data` that belong to this taxon.
        index_for_tax = np.where(data[:, 3].astype(str) == tax, True, False)
        number_of_contigs_for_tax = np.sum(index_for_tax)
        if number_of_contigs_for_tax == 0:
            continue

        # Length, GC and coverage of all contigs of this taxon.
        len_array = data[index_for_tax][:, 1].astype(int)
        gc_array = data[index_for_tax][:, 2].astype(float)
        cov_array = cov_data[index_for_tax].astype(float)

        # Float divisor: keep the fractional megabases shown in the label.
        span_of_contigs_for_tax_in_mb = np.sum(len_array) / 1000000.0

        label = (tax + " (" + "{:,}".format(number_of_contigs_for_tax) +
                 "; " + "%.2f" % round(span_of_contigs_for_tax_in_mb, 2) +
                 "MB; " + "{:,}".format(n50(len_array)) + "nt)")
        print("\t" + label)

        # Marker settings: 'no-hit' is always small, grey and translucent.
        if tax == 'no-hit':
            s, lw, alpha, color = 15, 0.5, 0.5, grey
        elif ignore_contig_len:
            s, lw, alpha, color = 65, 0.5, 1, color_dict[tax]
        else:
            s, lw, alpha, color = 15, 0.5, 1, color_dict[tax]

        if ignore_contig_len:
            # constant marker size
            s_array = [s] * len(len_array)
        else:
            # marker size scales with contig length
            s_array = [contig_length / s for contig_length in len_array]

        axHistx.set_ylabel(hist_axis_label)
        axHisty.set_xlabel(hist_axis_label, rotation='horizontal')

        # Rotate the tick labels of the coverage histogram.
        for xtick in axHisty.get_xticklabels():
            xtick.set_rotation(270)

        legend_handles.append(
            Line2D([0], [0],
                   linewidth=0.5,
                   linestyle="none",
                   marker="o",
                   alpha=1,
                   markersize=24,
                   markerfacecolor=color))
        legend_labels.append(label)

        # Span histograms weight every contig by its length in kb (float
        # divisor, so short contigs are not rounded down to zero).
        hist_kwargs = {}
        if hist_span:
            hist_kwargs['weights'] = len_array / 1000.0
        axHistx.hist(gc_array,
                     color=color,
                     bins=top_bins,
                     histtype='step',
                     lw=3,
                     **hist_kwargs)
        axHisty.hist(cov_array,
                     color=color,
                     bins=right_bins,
                     histtype='step',
                     orientation='horizontal',
                     lw=3,
                     **hist_kwargs)

        axScatter.scatter(gc_array,
                          cov_array,
                          color=color,
                          s=s_array,
                          lw=lw,
                          alpha=alpha,
                          edgecolor=black,
                          label=label)

        axLegend.axis('off')

        if multi_plot:
            # Save an intermediate figure after each plotted taxon.
            axLegend.legend(legend_handles,
                            legend_labels,
                            loc=6,
                            numpoints=1,
                            fontsize=fontsize,
                            frameon=True)
            plot_ref_legend(axScatter)
            plt.savefig(outfile + "." + str(i) + "_" +
                        tax.replace("/", "") + "." + fig_format,
                        format=fig_format)

    # The scale legend is only drawn when marker size encodes contig length.
    if not ignore_contig_len:
        plot_ref_legend(axScatter)

    axLegend.legend(legend_handles,
                    legend_labels,
                    numpoints=1,
                    fontsize=fontsize,
                    frameon=True,
                    loc=6)
    sys.stdout.write("Saving file " + outfile)
    plt.savefig(outfile + "." + fig_format, format=fig_format)
    plt.close()
    print(" [Done]\n")