def getStats(self):
    '''
    Calculates all different kind of stats for each taxonomic group based on
    each BLAST file and the consensus...

    Resets and fills self.stats from self.contigs and self.blast_libs:

    * 'total_count', 'total_span', 'total_lengths', 'total_n50' and
      'total_gc' summarise every contig.
    * 'total_cov' holds one summary per coverage library.
    * 'count', 'span', 'lengths', 'gc', 'cov' and 'n50' are keyed by BLAST
      library and then by taxon ('cov' has an extra coverage-library level
      between the two).

    A summary is a dict with the keys 'raw' (list of values), 'mean' and
    'stdev'. After this call 'mean' and 'stdev' are strings formatted with
    two decimals.

    Uses keyWithMaxVal() and n50() from the enclosing module.
    '''

    def new_summary():
        # Placeholder values are overwritten by summarise() below.
        return {'raw': [], 'mean': 0.0, 'stdev': 0.0}

    def summarise(summary):
        # Mean/stdev are stored as two-decimal strings, not floats.
        summary['mean'] = "{0:.2f}".format(numpy.mean(summary['raw']))
        summary['stdev'] = "{0:.2f}".format(numpy.std(summary['raw']))

    stats = self.stats
    for key in ('count', 'span', 'n50', 'lengths', 'gc', 'cov'):
        stats[key] = {}
    stats['total_count'] = 0
    stats['total_span'] = 0
    stats['total_n50'] = 0
    stats['total_lengths'] = []
    stats['total_cov'] = {}
    stats['total_gc'] = new_summary()
    stats['cov_libs'] = []

    for contig_name in self.contigs:
        blob = self.contigs[contig_name]
        stats['total_count'] += 1
        stats['total_span'] += blob.length
        stats['total_lengths'].append(blob.length)
        stats['total_gc']['raw'].append(blob.gc)

        for blast_lib in self.blast_libs:
            # NOTE(review): the argument is built from the library name only
            # and does not involve the current contig, so every contig gets
            # the same taxon for a given library - confirm this is intended.
            bestTax = keyWithMaxVal([blast_lib])

            for key in ('count', 'span', 'lengths', 'gc', 'cov'):
                stats[key].setdefault(blast_lib, {})

            counts = stats['count'][blast_lib]
            counts[bestTax] = counts.get(bestTax, 0) + 1
            spans = stats['span'][blast_lib]
            spans[bestTax] = spans.get(bestTax, 0) + blob.length

            gc_by_tax = stats['gc'][blast_lib]
            if bestTax not in gc_by_tax:
                gc_by_tax[bestTax] = new_summary()
            gc_by_tax[bestTax]['raw'].append(blob.gc)
            stats['lengths'][blast_lib].setdefault(bestTax, []).append(
                blob.length)

            cov_by_lib = stats['cov'][blast_lib]
            for cov_lib, cov in blob.covs.items():
                cov_by_tax = cov_by_lib.setdefault(cov_lib, {})
                if bestTax not in cov_by_tax:
                    cov_by_tax[bestTax] = new_summary()
                cov_by_tax[bestTax]['raw'].append(cov)

        # The overall coverage summary is created here, independently of the
        # BLAST libraries, so it also exists when blast_libs is empty.
        for cov_lib, cov in blob.covs.items():
            if cov_lib not in stats['total_cov']:
                stats['total_cov'][cov_lib] = new_summary()
            stats['total_cov'][cov_lib]['raw'].append(cov)

    for blast_lib in self.blast_libs:
        # .get() keeps a library without any recorded contig from raising.
        # calculate N50
        for tax, list_of_lengths in stats['lengths'].get(blast_lib,
                                                         {}).items():
            stats['n50'].setdefault(blast_lib, {})[tax] = n50(list_of_lengths)
        # calculate tax-specific gc mean/stdev
        for summary in stats['gc'].get(blast_lib, {}).values():
            summarise(summary)
        # calculate tax-specific cov mean/stdev
        for cov_by_tax in stats['cov'].get(blast_lib, {}).values():
            for summary in cov_by_tax.values():
                summarise(summary)

    # Overall figures do not depend on the BLAST library, so they are
    # computed once; total_n50 keeps its initial 0 when there are no contigs.
    if stats['total_lengths']:
        stats['total_n50'] = n50(stats['total_lengths'])
    for summary in stats['total_cov'].values():
        summarise(summary)
    summarise(stats['total_gc'])
def plot(data, cov_data, outfile, title):
    """ Plotting function which gets masked data and plots to outfile

    Draws one scatter series (GC vs. coverage) plus two marginal histograms
    per taxon in tax_list and saves the figure.

    data     -- 2-D array; column 1 is read as contig length (int), column 2
                as GC (float) and column 3 as the taxon label (str)
    cov_data -- array indexed with the same row mask as data, read as float
                coverage
    outfile  -- output prefix; ".hist_span" or ".hist_count" and
                "." + fig_format are appended
    title    -- figure title, skipped when falsy

    Reads these module-level names: tax_list, color_dict, max_cov, hist_span,
    ignore_contig_len, multi_plot, fig_format, fontsize, nullfmt, the colour
    constants (grey, black, white, background_grey) and the helpers
    set_canvas, set_format_scatterplot, set_format_hist_x, set_format_hist_y,
    plot_ref_legend and n50.
    """
    rect_scatter, rect_histx, rect_histy, rect_legend = set_canvas()

    # Setting up plots and axes
    plt.figure(1, figsize=(35, 35), dpi=400)
    axScatter = plt.axes(rect_scatter, axisbg=background_grey, yscale='log')
    axScatter = set_format_scatterplot(axScatter)
    axHistx = plt.axes(rect_histx, axisbg=background_grey)
    axHistx = set_format_hist_x(axHistx, axScatter)
    axHisty = plt.axes(rect_histy, axisbg=background_grey)
    axHisty = set_format_hist_y(axHisty, axScatter)
    # hide the labels of the first two major ticks on the coverage axis
    for tick in axScatter.yaxis.get_major_ticks()[:2]:
        tick.label1.set_visible(False)
    if title:
        plt.suptitle(title, fontsize=35, verticalalignment='top')
    axLegend = plt.axes(rect_legend, axisbg=white)
    for axis in (axLegend.xaxis, axLegend.yaxis):
        axis.set_major_locator(plt.NullLocator())
        axis.set_major_formatter(nullfmt)

    # Setting bins for histograms
    top_bins = np.arange(0, 1, 0.01)
    # NOTE(review): math.log is the natural log while the bins are spaced in
    # base 10, so the upper edge lies above max_cov by more than one decade.
    # Left unchanged because it moves the bin edges - confirm intent.
    right_bins = np.logspace(-2, (int(math.log(max_cov)) + 1), 200, base=10.0)

    # handles/labels for the big legend, one entry per plotted taxon
    legend_handles = []
    legend_labels = []

    # change file name if span (should be in input parsing function)
    outfile += ".hist_span" if hist_span else ".hist_count"

    print("[STATUS] Plotting : " + outfile)

    # for each phylum ... they are ordered
    # i numbers the multiplot files so that they sort in plotting order
    for i, tax in enumerate(tax_list, 1):
        # boolean mask of the rows in data where the phylum == tax
        index_for_tax = data[:, 3].astype(str) == tax
        number_of_contigs_for_tax = np.sum(index_for_tax)
        if number_of_contigs_for_tax == 0:
            # nothing to draw and no legend entry for an absent taxon
            continue

        # length, gc and cov for all contigs in phylum
        rows_for_tax = data[index_for_tax]
        len_array = rows_for_tax[:, 1].astype(int)
        gc_array = rows_for_tax[:, 2].astype(float)
        cov_array = cov_data[index_for_tax].astype(float)

        # Float division: integer division made the two-decimal figure in
        # the label always end in ".00".
        span_of_contigs_for_tax_in_mb = np.sum(len_array) / 1000000.0

        label = (tax + " (" + "{:,}".format(number_of_contigs_for_tax) +
                 "; " + "%.2f" % round(span_of_contigs_for_tax_in_mb, 2) +
                 "MB; " + "{:,}".format(n50(len_array)) + "nt)")
        print("\t" + label)

        # marker parameters; 'no-hit' is drawn smaller/fainter in grey
        if tax == 'no-hit':
            s, lw, alpha, color = 15, 0.5, 0.5, grey
        elif ignore_contig_len:
            s, lw, alpha, color = 65, 0.5, 1, color_dict[tax]
        else:
            s, lw, alpha, color = 15, 0.5, 1, color_dict[tax]
        if ignore_contig_len:
            # every contig gets the same marker size
            s_array = [s] * len(len_array)
        else:
            # marker size scales with contig length (operator left as-is)
            s_array = [contig_length / s for contig_length in len_array]

        # Histogram weights in kb; float division so that contigs shorter
        # than 1 kb still contribute to the span histograms.
        weights_array = len_array / 1000.0
        axis_title = "Span (kb)" if hist_span else "Count"
        axHistx.set_ylabel(axis_title)
        axHisty.set_xlabel(axis_title, rotation='horizontal')

        # rotate text for ticks in cov histogram
        for xtick in axHisty.get_xticklabels():
            xtick.set_rotation(270)

        legend_handles.append(
            Line2D([0], [0], linewidth=0.5, linestyle="none", marker="o",
                   alpha=1, markersize=24, markerfacecolor=color))
        legend_labels.append(label)

        hist_options = dict(color=color, histtype='step', lw=3)
        if hist_span:
            hist_options['weights'] = weights_array
        axHistx.hist(gc_array, bins=top_bins, **hist_options)
        axHisty.hist(cov_array, bins=right_bins, orientation='horizontal',
                     **hist_options)

        axScatter.scatter(gc_array, cov_array, color=color, s=s_array, lw=lw,
                          alpha=alpha, edgecolor=black, label=label)
        axLegend.axis('off')

        if multi_plot:
            # MULTI-PLOT: save the cumulative figure after each taxon
            axLegend.legend(legend_handles, legend_labels, loc=6,
                            numpoints=1, fontsize=fontsize, frameon=True)
            plot_ref_legend(axScatter)
            plt.savefig(outfile + "." + str(i) + "_" + tax.replace("/", "") +
                        "." + fig_format, format=fig_format)

    if not ignore_contig_len:
        # print scale-legend
        plot_ref_legend(axScatter)

    axLegend.legend(legend_handles, legend_labels, numpoints=1,
                    fontsize=fontsize, frameon=True, loc=6)
    sys.stdout.write("Saving file " + outfile)
    plt.savefig(outfile + "." + fig_format, format=fig_format)
    plt.close()
    print(" [Done]\n")
def getStats(self):
    '''
    Calculates all different kind of stats for each taxonomic group based on
    each BLAST file and the consensus...

    Resets and fills self.stats from self.contigs and self.blast_libs:

    * 'total_count', 'total_span', 'total_lengths', 'total_n50' and
      'total_gc' summarise every contig.
    * 'total_cov' holds one summary per coverage library.
    * 'count', 'span', 'lengths', 'gc', 'cov' and 'n50' are keyed by BLAST
      library and then by taxon ('cov' has an extra coverage-library level
      between the two).

    A summary is a dict with the keys 'raw' (list of values), 'mean' and
    'stdev'. After this call 'mean' and 'stdev' are strings formatted with
    two decimals.

    Uses keyWithMaxVal() and n50() from the enclosing module.
    '''

    def new_summary():
        # Placeholder values are overwritten by summarise() below.
        return {'raw': [], 'mean': 0.0, 'stdev': 0.0}

    def summarise(summary):
        # Mean/stdev are stored as two-decimal strings, not floats.
        summary['mean'] = "{0:.2f}".format(numpy.mean(summary['raw']))
        summary['stdev'] = "{0:.2f}".format(numpy.std(summary['raw']))

    stats = self.stats
    for key in ('count', 'span', 'n50', 'lengths', 'gc', 'cov'):
        stats[key] = {}
    stats['total_count'] = 0
    stats['total_span'] = 0
    stats['total_n50'] = 0
    stats['total_lengths'] = []
    stats['total_cov'] = {}
    stats['total_gc'] = new_summary()
    stats['cov_libs'] = []

    for contig_name in self.contigs:
        blob = self.contigs[contig_name]
        stats['total_count'] += 1
        stats['total_span'] += blob.length
        stats['total_lengths'].append(blob.length)
        stats['total_gc']['raw'].append(blob.gc)

        for blast_lib in self.blast_libs:
            # NOTE(review): the argument is built from the library name only
            # and does not involve the current contig, so every contig gets
            # the same taxon for a given library - confirm this is intended.
            bestTax = keyWithMaxVal([blast_lib])

            for key in ('count', 'span', 'lengths', 'gc', 'cov'):
                stats[key].setdefault(blast_lib, {})

            counts = stats['count'][blast_lib]
            counts[bestTax] = counts.get(bestTax, 0) + 1
            spans = stats['span'][blast_lib]
            spans[bestTax] = spans.get(bestTax, 0) + blob.length

            gc_by_tax = stats['gc'][blast_lib]
            if bestTax not in gc_by_tax:
                gc_by_tax[bestTax] = new_summary()
            gc_by_tax[bestTax]['raw'].append(blob.gc)
            stats['lengths'][blast_lib].setdefault(bestTax, []).append(
                blob.length)

            cov_by_lib = stats['cov'][blast_lib]
            for cov_lib, cov in blob.covs.items():
                cov_by_tax = cov_by_lib.setdefault(cov_lib, {})
                if bestTax not in cov_by_tax:
                    cov_by_tax[bestTax] = new_summary()
                cov_by_tax[bestTax]['raw'].append(cov)

        # The overall coverage summary is created here, independently of the
        # BLAST libraries, so it also exists when blast_libs is empty.
        for cov_lib, cov in blob.covs.items():
            if cov_lib not in stats['total_cov']:
                stats['total_cov'][cov_lib] = new_summary()
            stats['total_cov'][cov_lib]['raw'].append(cov)

    for blast_lib in self.blast_libs:
        # .get() keeps a library without any recorded contig from raising.
        # calculate N50
        for tax, list_of_lengths in stats['lengths'].get(blast_lib,
                                                         {}).items():
            stats['n50'].setdefault(blast_lib, {})[tax] = n50(list_of_lengths)
        # calculate tax-specific gc mean/stdev
        for summary in stats['gc'].get(blast_lib, {}).values():
            summarise(summary)
        # calculate tax-specific cov mean/stdev
        for cov_by_tax in stats['cov'].get(blast_lib, {}).values():
            for summary in cov_by_tax.values():
                summarise(summary)

    # Overall figures do not depend on the BLAST library, so they are
    # computed once; total_n50 keeps its initial 0 when there are no contigs.
    if stats['total_lengths']:
        stats['total_n50'] = n50(stats['total_lengths'])
    for summary in stats['total_cov'].values():
        summarise(summary)
    summarise(stats['total_gc'])
def plot(data, cov_data, outfile, title):
    """ Plotting function which gets masked data and plots to outfile

    Draws one scatter series (GC vs. coverage) plus two marginal histograms
    per taxon in tax_list and saves the figure.

    data     -- 2-D array; column 1 is read as contig length (int), column 2
                as GC (float) and column 3 as the taxon label (str)
    cov_data -- array indexed with the same row mask as data, read as float
                coverage
    outfile  -- output prefix; ".hist_span" or ".hist_count" and
                "." + fig_format are appended
    title    -- figure title, skipped when falsy

    Reads these module-level names: tax_list, color_dict, max_cov, hist_span,
    ignore_contig_len, multi_plot, fig_format, fontsize, nullfmt, the colour
    constants (grey, black, white, background_grey) and the helpers
    set_canvas, set_format_scatterplot, set_format_hist_x, set_format_hist_y,
    plot_ref_legend and n50.
    """
    rect_scatter, rect_histx, rect_histy, rect_legend = set_canvas()

    # Setting up plots and axes
    plt.figure(1, figsize=(35, 35), dpi=400)
    axScatter = plt.axes(rect_scatter, axisbg=background_grey, yscale='log')
    axScatter = set_format_scatterplot(axScatter)
    axHistx = plt.axes(rect_histx, axisbg=background_grey)
    axHistx = set_format_hist_x(axHistx, axScatter)
    axHisty = plt.axes(rect_histy, axisbg=background_grey)
    axHisty = set_format_hist_y(axHisty, axScatter)
    # hide the labels of the first two major ticks on the coverage axis
    for tick in axScatter.yaxis.get_major_ticks()[:2]:
        tick.label1.set_visible(False)
    if title:
        plt.suptitle(title, fontsize=35, verticalalignment='top')
    axLegend = plt.axes(rect_legend, axisbg=white)
    for axis in (axLegend.xaxis, axLegend.yaxis):
        axis.set_major_locator(plt.NullLocator())
        axis.set_major_formatter(nullfmt)

    # Setting bins for histograms
    top_bins = np.arange(0, 1, 0.01)
    # NOTE(review): math.log is the natural log while the bins are spaced in
    # base 10, so the upper edge lies above max_cov by more than one decade.
    # Left unchanged because it moves the bin edges - confirm intent.
    right_bins = np.logspace(-2, (int(math.log(max_cov)) + 1), 200, base=10.0)

    # handles/labels for the big legend, one entry per plotted taxon
    legend_handles = []
    legend_labels = []

    # change file name if span (should be in input parsing function)
    outfile += ".hist_span" if hist_span else ".hist_count"

    print("[STATUS] Plotting : " + outfile)

    # for each phylum ... they are ordered
    # i numbers the multiplot files so that they sort in plotting order
    for i, tax in enumerate(tax_list, 1):
        # boolean mask of the rows in data where the phylum == tax
        index_for_tax = data[:, 3].astype(str) == tax
        number_of_contigs_for_tax = np.sum(index_for_tax)
        if number_of_contigs_for_tax == 0:
            # nothing to draw and no legend entry for an absent taxon
            continue

        # length, gc and cov for all contigs in phylum
        rows_for_tax = data[index_for_tax]
        len_array = rows_for_tax[:, 1].astype(int)
        gc_array = rows_for_tax[:, 2].astype(float)
        cov_array = cov_data[index_for_tax].astype(float)

        # Float division: integer division made the two-decimal figure in
        # the label always end in ".00".
        span_of_contigs_for_tax_in_mb = np.sum(len_array) / 1000000.0

        label = (tax + " (" + "{:,}".format(number_of_contigs_for_tax) +
                 "; " + "%.2f" % round(span_of_contigs_for_tax_in_mb, 2) +
                 "MB; " + "{:,}".format(n50(len_array)) + "nt)")
        print("\t" + label)

        # marker parameters; 'no-hit' is drawn smaller/fainter in grey
        if tax == 'no-hit':
            s, lw, alpha, color = 15, 0.5, 0.5, grey
        elif ignore_contig_len:
            s, lw, alpha, color = 65, 0.5, 1, color_dict[tax]
        else:
            s, lw, alpha, color = 15, 0.5, 1, color_dict[tax]
        if ignore_contig_len:
            # every contig gets the same marker size
            s_array = [s] * len(len_array)
        else:
            # marker size scales with contig length (operator left as-is)
            s_array = [contig_length / s for contig_length in len_array]

        # Histogram weights in kb; float division so that contigs shorter
        # than 1 kb still contribute to the span histograms.
        weights_array = len_array / 1000.0
        axis_title = "Span (kb)" if hist_span else "Count"
        axHistx.set_ylabel(axis_title)
        axHisty.set_xlabel(axis_title, rotation='horizontal')

        # rotate text for ticks in cov histogram
        for xtick in axHisty.get_xticklabels():
            xtick.set_rotation(270)

        legend_handles.append(
            Line2D([0], [0], linewidth=0.5, linestyle="none", marker="o",
                   alpha=1, markersize=24, markerfacecolor=color))
        legend_labels.append(label)

        hist_options = dict(color=color, histtype='step', lw=3)
        if hist_span:
            hist_options['weights'] = weights_array
        axHistx.hist(gc_array, bins=top_bins, **hist_options)
        axHisty.hist(cov_array, bins=right_bins, orientation='horizontal',
                     **hist_options)

        axScatter.scatter(gc_array, cov_array, color=color, s=s_array, lw=lw,
                          alpha=alpha, edgecolor=black, label=label)
        axLegend.axis('off')

        if multi_plot:
            # MULTI-PLOT: save the cumulative figure after each taxon
            axLegend.legend(legend_handles, legend_labels, loc=6,
                            numpoints=1, fontsize=fontsize, frameon=True)
            plot_ref_legend(axScatter)
            plt.savefig(outfile + "." + str(i) + "_" + tax.replace("/", "") +
                        "." + fig_format, format=fig_format)

    if not ignore_contig_len:
        # print scale-legend
        plot_ref_legend(axScatter)

    axLegend.legend(legend_handles, legend_labels, numpoints=1,
                    fontsize=fontsize, frameon=True, loc=6)
    sys.stdout.write("Saving file " + outfile)
    plt.savefig(outfile + "." + fig_format, format=fig_format)
    plt.close()
    print(" [Done]\n")