def graph_est(est, expname, tier, show_graphs, save_graphs, errors):
    """Scatter-plot the estimated counts that survive a minimum-count filter.

    Species in ``est`` are first restricted with
    ``filter_by_taxa(est.species, tier)``.  Entries whose count is not
    strictly above the cutoff are dropped; the cutoff is 1000 when ``est``
    holds more than 100 species and 10 otherwise.  The survivors are sorted
    by descending value, printed together with their percentage of the
    total (rounded to one decimal), and plotted.

    If nothing passes the filter a notice is printed and no figure is
    produced.

    Args:
        est: dataset of estimates; must provide ``species``,
            ``lookup_count`` and ``lookup_size``.
        expname: prefix of the output image file name.
        tier: taxonomic level; used for filtering, the plot title and the
            output file name.
        show_graphs: forwarded as ``show`` to ``plot_setup_post`` when the
            figure is saved.
        save_graphs: if true, the figure is saved to
            ``<expname>_<tier>_estabundances.png``.
        errors: unused; kept so existing callers keep working.

    Returns:
        None.
    """
    # These imports are here so script can run on server w/out graphics.
    # pyplot is imported explicitly: a bare ``import matplotlib`` does not
    # guarantee that the ``matplotlib.pyplot`` submodule has been loaded.
    import plotfunctions
    import matplotlib.pyplot
    # NOTE(review): seaborn is never referenced below; presumably imported
    # for its styling side effect -- confirm before removing.
    import seaborn

    # Drop any entry that is above the targeted taxa level
    filtered_names = list(filter_by_taxa(est.species, tier))

    if len(est.species) > 100:
        min_abundances = 1000
        print("Filtering out any estimated results under {} counts".format(
            min_abundances))
    else:
        min_abundances = 10

    filtered_est = Dataset()
    for sp in filtered_names:
        count = est.lookup_count(sp)
        # filter out results with less than min abundances
        if count > min_abundances:
            filtered_est.add_record(sp, count, count, est.lookup_size(sp))

    present_species = [
        tax_dict.get_name_by_id(s) for s in filtered_est.species
    ]
    present_est = list(filtered_est.abundance)
    present_sp = [name.replace('_', ' ') for name in present_species]

    # sort based on est abundances (largest first)
    ranked = sorted(zip(present_sp, present_est),
                    key=lambda pair: pair[1],
                    reverse=True)
    if not ranked:
        # zip(*[]) cannot be unpacked into two names; bail out cleanly.
        print("No estimated results passed the filter; nothing to graph")
        return
    fil_sp, fil_est = zip(*ranked)

    print("Results that passed filter:")
    # calculate % for each result; the total is summed once, not per item
    total = math.fsum(fil_est)
    fil_per = [round(100 * n / total, 1) for n in fil_est]
    all_display = list(zip(fil_sp, fil_est, fil_per))
    pprint.pprint(all_display)

    # graph est abundances
    xmax = len(present_species)
    x = numpy.array(range(0, xmax))
    plotfunctions.plot_setup_pre(
        "{}-level estimated counts".format(tier),
        xlabels=fil_sp,
        xticks=list(range(0, xmax)),
        xrotation=-90,
        yaxislabel='Counts')
    plotfunctions.plot(x, fil_est, color='red', plot_type='scatter')
    matplotlib.pyplot.gca().set_ylim(bottom=0.)  # y axis starts at 0

    if save_graphs:
        plotfunctions.plot_setup_post(
            save_file=expname + '_' + tier + '_estabundances.png',
            show=show_graphs)
    else:
        plotfunctions.plot_setup_post(legend=False)
    return
def graph_est(est, expname, tier, show_graphs, save_graphs, errors):
    """Scatter-plot the estimated counts that survive a minimum-count filter.

    Species in ``est`` are first restricted with
    ``filter_by_taxa(est.species, tier)``.  Entries whose count is not
    strictly above the cutoff are dropped; the cutoff is 1000 when ``est``
    holds more than 100 species and 10 otherwise.  The survivors are sorted
    by descending value, printed together with their percentage of the
    total (rounded to one decimal), and plotted.

    If nothing passes the filter a notice is printed and no figure is
    produced.

    Args:
        est: dataset of estimates; must provide ``species``,
            ``lookup_count`` and ``lookup_size``.
        expname: prefix of the output image file name.
        tier: taxonomic level; used for filtering, the plot title and the
            output file name.
        show_graphs: forwarded as ``show`` to ``plot_setup_post`` when the
            figure is saved.
        save_graphs: if true, the figure is saved to
            ``<expname>_<tier>_estabundances.png``.
        errors: unused; kept so existing callers keep working.

    Returns:
        None.
    """
    # These imports are here so script can run on server w/out graphics.
    # pyplot is imported explicitly: a bare ``import matplotlib`` does not
    # guarantee that the ``matplotlib.pyplot`` submodule has been loaded.
    import plotfunctions
    import matplotlib.pyplot
    # NOTE(review): seaborn is never referenced below; presumably imported
    # for its styling side effect -- confirm before removing.
    import seaborn

    # Drop any entry that is above the targeted taxa level
    filtered_names = list(filter_by_taxa(est.species, tier))

    if len(est.species) > 100:
        min_abundances = 1000
        print("Filtering out any estimated results under {} counts".format(
            min_abundances))
    else:
        min_abundances = 10

    filtered_est = Dataset()
    for sp in filtered_names:
        count = est.lookup_count(sp)
        # filter out results with less than min abundances
        if count > min_abundances:
            filtered_est.add_record(sp, count, count, est.lookup_size(sp))

    present_species = [
        tax_dict.get_name_by_id(s) for s in filtered_est.species
    ]
    present_est = list(filtered_est.abundance)
    present_sp = [name.replace('_', ' ') for name in present_species]

    # sort based on est abundances (largest first)
    ranked = sorted(zip(present_sp, present_est),
                    key=lambda pair: pair[1],
                    reverse=True)
    if not ranked:
        # zip(*[]) cannot be unpacked into two names; bail out cleanly.
        print("No estimated results passed the filter; nothing to graph")
        return
    fil_sp, fil_est = zip(*ranked)

    print("Results that passed filter:")
    # calculate % for each result; the total is summed once, not per item
    total = math.fsum(fil_est)
    fil_per = [round(100 * n / total, 1) for n in fil_est]
    all_display = list(zip(fil_sp, fil_est, fil_per))
    pprint.pprint(all_display)

    # graph est abundances
    xmax = len(present_species)
    x = numpy.array(range(0, xmax))
    plotfunctions.plot_setup_pre(
        "{}-level estimated counts".format(tier),
        xlabels=fil_sp,
        xticks=list(range(0, xmax)),
        xrotation=-90,
        yaxislabel='Counts')
    plotfunctions.plot(x, fil_est, color='red', plot_type='scatter')
    matplotlib.pyplot.gca().set_ylim(bottom=0.)  # y axis starts at 0

    if save_graphs:
        plotfunctions.plot_setup_post(
            save_file=expname + '_' + tier + '_estabundances.png',
            show=show_graphs)
    else:
        plotfunctions.plot_setup_post(legend=False)
    return
def graph_error(truth, est, adjusted_abundance, diff, expname, tier,
                norm_factor, show_graphs, save_graphs, errors,
                program='kallisto'):
    """Plot true counts against estimated counts for one taxonomic level.

    One of two figures is produced:

    * If, after taxa filtering, the working estimate holds exactly as many
      species as ``truth``, the true counts are plotted against
      ``adjusted_abundance`` (file ``<expname>_<tier>_counts.png``).
    * Otherwise every species that is in ``truth`` or whose normalised
      estimate is above a cutoff is plotted with error bars (file
      ``<expname>_<tier>_sigcounts.png``).  The cutoff is
      ``min(truth.counts) / 10`` when the number of estimated species minus
      the number of zero counts exceeds 1.25 times the number of true
      species, and 1 otherwise.

    If there is nothing to plot a notice is printed and no figure is made.

    Note:
        ``truth`` is modified in place: ``truth.species`` is replaced by the
        values returned from ``tax_dict.get_name_by_id`` and
        ``truth.remake_index()`` is called.

    Args:
        truth: dataset of true values; must provide ``species``, ``counts``,
            ``remake_index`` and ``lookup_count``.
        est: dataset of estimates; must provide ``species`` and
            ``lookup_count``.
        adjusted_abundance: sequence zipped element-wise with
            ``truth.species`` / ``truth.counts`` in the first figure.
        diff: unused; kept so existing callers keep working.
        expname: prefix of the output image file name.
        tier: taxonomic level; used for filtering, the title and file name.
        norm_factor: multiplier applied to the estimated counts.
        show_graphs: forwarded as ``show`` to ``plot_setup_post`` when the
            figure is saved.
        save_graphs: if true, the figure is saved to disk.
        errors: indexed by species id; the values are stored in the
            abundance field of the working dataset and drawn as ``yerr``.
        program: tool name shown in the title; ``'clark'`` and ``'bracken'``
            are re-cased to ``'CLARK'`` and ``'Bracken'``.

    Returns:
        None.
    """
    # These imports are here so script can run on server w/out graphics.
    # pyplot is imported explicitly: a bare ``import matplotlib`` does not
    # guarantee that the ``matplotlib.pyplot`` submodule has been loaded.
    import plotfunctions
    import matplotlib.pyplot
    # NOTE(review): seaborn is never referenced below; presumably imported
    # for its styling side effect -- confirm before removing.
    import seaborn

    # format program name
    program = {'clark': 'CLARK', 'bracken': 'Bracken'}.get(program, program)

    # Drop any entry that is above the targeted taxa level
    # sometimes the strain-level filter filters out species-level genomes,
    # so add truth back in
    filtered_names = list(
        filter_by_taxa(est.species, tier).union(set(truth.species)))

    filtered_est = Dataset()
    # abundance field contains stdev errors if they exist
    filtered_est.set_by_array([(sp, errors[sp], est.lookup_count(sp))
                               for sp in filtered_names])

    # Switch both datasets from ids to display names (mutates ``truth``).
    truth.species = [tax_dict.get_name_by_id(s) for s in truth.species]
    truth.remake_index()

    filtered_est.species = [
        tax_dict.get_name_by_id(s) for s in filtered_est.species
    ]
    filtered_est.counts = list(numpy.array(filtered_est.counts) * norm_factor)
    filtered_est.remake_index()

    # graph true abundances
    if len(filtered_est.species) == len(truth.species):
        ab_filter = sorted(
            zip(truth.species, truth.counts, adjusted_abundance),
            key=lambda row: row[1],
            reverse=True)
        if not ab_filter:
            # zip(*[]) cannot be unpacked; bail out cleanly.
            print("No species to graph")
            return
        ab_species, ab_true, ab_adjusted = zip(*ab_filter)

        xmax = len(truth.species)
        x = numpy.array(range(0, xmax))
        plotfunctions.plot_setup_pre(
            "{} estimated counts at {}-level".format(program, tier),
            xlabels=ab_species,
            xticks=list(range(0, xmax)),
            xrotation=-90,
            yaxislabel='Counts')
        plotfunctions.plot(x, ab_true, color='blue', label='True')
        plotfunctions.plot(x, ab_adjusted, color='red', label='Estimated',
                           plot_type='scatter')
        matplotlib.pyplot.gca().set_ylim(bottom=0.)  # set x axis at y=0

        if save_graphs:
            plotfunctions.plot_setup_post(
                save_file=expname + '_' + tier + '_counts.png',
                show=show_graphs)
        else:
            plotfunctions.plot_setup_post(legend=False)
        return

    # graph abundances for all species, not just the true ones, if above
    # the cutoff
    if (len(est.species) - filtered_est.counts.count(0)
            > len(truth.species) * 1.25):
        mean_ab = min(truth.counts) / 10  # 10% of lowest actual count in truth
        print("Filtering out any estimated results under {} counts".format(
            mean_ab))
    else:
        mean_ab = 1

    truth_names = set(truth.species)  # built once for O(1) membership tests
    rows = []
    for sp in filtered_est.species:
        # keep every species present in truth, plus any other species whose
        # estimated count is above the cutoff
        if sp in truth_names or filtered_est.lookup_count(sp) > mean_ab:
            rows.append((
                sp.replace('_', ' '),  # presentation name
                truth.lookup_count(sp),
                filtered_est.lookup_count(sp),
                filtered_est.lookup_abundance(sp),
            ))

    if not rows:
        # zip(*[]) cannot be unpacked; bail out cleanly.
        print("No species to graph")
        return

    xmax = len(rows)
    x = numpy.array(range(0, xmax))

    # sort based on true counts, then sort false positives by est counts
    # (two stable passes: secondary key first, primary key second)
    rows.sort(key=lambda row: row[2], reverse=True)
    rows.sort(key=lambda row: row[1], reverse=True)
    present_species, present_true, present_est, present_errors = zip(*rows)

    plotfunctions.plot_setup_pre(
        "{} estimated counts at {}-level".format(program, tier),
        xticks=list(range(0, xmax)),
        xrotation=-90,
        yaxislabel='Counts',
        xlabels=present_species)
    plotfunctions.plot(x, present_true, color='blue', label="True")
    plotfunctions.plot(x, present_est, plot_type='error', color='red',
                       label="Estimated", fmt='o', yerr=present_errors)
    matplotlib.pyplot.gca().set_ylim(bottom=0.)

    if save_graphs:
        plotfunctions.plot_setup_post(
            save_file=expname + '_' + tier + '_sigcounts.png',
            show=show_graphs)
    else:
        plotfunctions.plot_setup_post(legend=False)
    return
def graph_error(truth, est, adjusted_abundance, diff, expname, tier,
                norm_factor, show_graphs, save_graphs, errors,
                program='kallisto'):
    """Plot true counts against estimated counts for one taxonomic level.

    One of two figures is produced:

    * If, after taxa filtering, the working estimate holds exactly as many
      species as ``truth``, the true counts are plotted against
      ``adjusted_abundance`` (file ``<expname>_<tier>_counts.png``).
    * Otherwise every species that is in ``truth`` or whose normalised
      estimate is above a cutoff is plotted with error bars (file
      ``<expname>_<tier>_sigcounts.png``).  The cutoff is
      ``min(truth.counts) / 10`` when the number of estimated species minus
      the number of zero counts exceeds 1.25 times the number of true
      species, and 1 otherwise.

    If there is nothing to plot a notice is printed and no figure is made.

    Note:
        ``truth`` is modified in place: ``truth.species`` is replaced by the
        values returned from ``tax_dict.get_name_by_id`` and
        ``truth.remake_index()`` is called.

    Args:
        truth: dataset of true values; must provide ``species``, ``counts``,
            ``remake_index`` and ``lookup_count``.
        est: dataset of estimates; must provide ``species`` and
            ``lookup_count``.
        adjusted_abundance: sequence zipped element-wise with
            ``truth.species`` / ``truth.counts`` in the first figure.
        diff: unused; kept so existing callers keep working.
        expname: prefix of the output image file name.
        tier: taxonomic level; used for filtering, the title and file name.
        norm_factor: multiplier applied to the estimated counts.
        show_graphs: forwarded as ``show`` to ``plot_setup_post`` when the
            figure is saved.
        save_graphs: if true, the figure is saved to disk.
        errors: indexed by species id; the values are stored in the
            abundance field of the working dataset and drawn as ``yerr``.
        program: tool name shown in the title; ``'clark'`` and ``'bracken'``
            are re-cased to ``'CLARK'`` and ``'Bracken'``.

    Returns:
        None.
    """
    # These imports are here so script can run on server w/out graphics.
    # pyplot is imported explicitly: a bare ``import matplotlib`` does not
    # guarantee that the ``matplotlib.pyplot`` submodule has been loaded.
    import plotfunctions
    import matplotlib.pyplot
    # NOTE(review): seaborn is never referenced below; presumably imported
    # for its styling side effect -- confirm before removing.
    import seaborn

    # format program name
    program = {'clark': 'CLARK', 'bracken': 'Bracken'}.get(program, program)

    # Drop any entry that is above the targeted taxa level
    # sometimes the strain-level filter filters out species-level genomes,
    # so add truth back in
    filtered_names = list(
        filter_by_taxa(est.species, tier).union(set(truth.species)))

    filtered_est = Dataset()
    # abundance field contains stdev errors if they exist
    filtered_est.set_by_array([(sp, errors[sp], est.lookup_count(sp))
                               for sp in filtered_names])

    # Switch both datasets from ids to display names (mutates ``truth``).
    truth.species = [tax_dict.get_name_by_id(s) for s in truth.species]
    truth.remake_index()

    filtered_est.species = [
        tax_dict.get_name_by_id(s) for s in filtered_est.species
    ]
    filtered_est.counts = list(numpy.array(filtered_est.counts) * norm_factor)
    filtered_est.remake_index()

    # graph true abundances
    if len(filtered_est.species) == len(truth.species):
        ab_filter = sorted(
            zip(truth.species, truth.counts, adjusted_abundance),
            key=lambda row: row[1],
            reverse=True)
        if not ab_filter:
            # zip(*[]) cannot be unpacked; bail out cleanly.
            print("No species to graph")
            return
        ab_species, ab_true, ab_adjusted = zip(*ab_filter)

        xmax = len(truth.species)
        x = numpy.array(range(0, xmax))
        plotfunctions.plot_setup_pre(
            "{} estimated counts at {}-level".format(program, tier),
            xlabels=ab_species,
            xticks=list(range(0, xmax)),
            xrotation=-90,
            yaxislabel='Counts')
        plotfunctions.plot(x, ab_true, color='blue', label='True')
        plotfunctions.plot(x, ab_adjusted, color='red', label='Estimated',
                           plot_type='scatter')
        matplotlib.pyplot.gca().set_ylim(bottom=0.)  # set x axis at y=0

        if save_graphs:
            plotfunctions.plot_setup_post(
                save_file=expname + '_' + tier + '_counts.png',
                show=show_graphs)
        else:
            plotfunctions.plot_setup_post(legend=False)
        return

    # graph abundances for all species, not just the true ones, if above
    # the cutoff
    if (len(est.species) - filtered_est.counts.count(0)
            > len(truth.species) * 1.25):
        mean_ab = min(truth.counts) / 10  # 10% of lowest actual count in truth
        print("Filtering out any estimated results under {} counts".format(
            mean_ab))
    else:
        mean_ab = 1

    truth_names = set(truth.species)  # built once for O(1) membership tests
    rows = []
    for sp in filtered_est.species:
        # keep every species present in truth, plus any other species whose
        # estimated count is above the cutoff
        if sp in truth_names or filtered_est.lookup_count(sp) > mean_ab:
            rows.append((
                sp.replace('_', ' '),  # presentation name
                truth.lookup_count(sp),
                filtered_est.lookup_count(sp),
                filtered_est.lookup_abundance(sp),
            ))

    if not rows:
        # zip(*[]) cannot be unpacked; bail out cleanly.
        print("No species to graph")
        return

    xmax = len(rows)
    x = numpy.array(range(0, xmax))

    # sort based on true counts, then sort false positives by est counts
    # (two stable passes: secondary key first, primary key second)
    rows.sort(key=lambda row: row[2], reverse=True)
    rows.sort(key=lambda row: row[1], reverse=True)
    present_species, present_true, present_est, present_errors = zip(*rows)

    plotfunctions.plot_setup_pre(
        "{} estimated counts at {}-level".format(program, tier),
        xticks=list(range(0, xmax)),
        xrotation=-90,
        yaxislabel='Counts',
        xlabels=present_species)
    plotfunctions.plot(x, present_true, color='blue', label="True")
    plotfunctions.plot(x, present_est, plot_type='error', color='red',
                       label="Estimated", fmt='o', yerr=present_errors)
    matplotlib.pyplot.gca().set_ylim(bottom=0.)

    if save_graphs:
        plotfunctions.plot_setup_post(
            save_file=expname + '_' + tier + '_sigcounts.png',
            show=show_graphs)
    else:
        plotfunctions.plot_setup_post(legend=False)
    return
def graph_est(est, expname, tier, show_graphs, save_graphs, errors):
    """Scatter-plot the estimated counts that survive a minimum-count filter.

    Species in ``est`` are restricted with
    ``filter_by_taxa(est.species, tier)``; if that call raises, all of
    ``est.species`` is used instead.  Entries whose count is not strictly
    above the cutoff are dropped; the cutoff is 1000 when ``est`` holds more
    than 100 species and 10 otherwise.  The survivors are sorted by
    descending value, written as ``name,value`` lines to
    ``species_hits.txt`` in the current directory, and plotted.

    The function reads the names ``transcripts`` and ``functional``, which
    are not parameters and must be defined in the enclosing scope.  When
    both are false, species ids are translated with
    ``tax_dict.get_name_by_id``; otherwise they are used as-is.

    If nothing passes the filter a notice is printed, ``species_hits.txt``
    is not touched and no figure is produced.

    Args:
        est: dataset of estimates; must provide ``species``,
            ``lookup_count`` and ``lookup_size``.
        expname: prefix of the output image file name.
        tier: taxonomic level; used for filtering and the output file name.
        show_graphs: forwarded as ``show`` to ``plot_setup_post`` when the
            figure is saved.
        save_graphs: if true, the figure is saved to
            ``<expname>_<tier>_estabundances.png``.
        errors: unused; kept so existing callers keep working.

    Returns:
        None.
    """
    # These imports are here so script can run on server w/out graphics.
    # pyplot is imported explicitly: a bare ``import matplotlib`` does not
    # guarantee that the ``matplotlib.pyplot`` submodule has been loaded.
    import plotfunctions
    import matplotlib.pyplot
    # NOTE(review): seaborn is never referenced below; presumably imported
    # for its styling side effect -- confirm before removing.
    import seaborn

    # Drop any entry that is above the targeted taxa level
    try:
        filtered_names = list(filter_by_taxa(est.species, tier))
    except Exception:
        # for functional or transcript level analysis
        # (``Exception`` rather than a bare except, so Ctrl-C and SystemExit
        # still propagate)
        filtered_names = est.species

    if len(est.species) > 100:
        min_abundances = 1000
        print("Filtering out any estimated results under {} counts".format(
            min_abundances))
    else:
        min_abundances = 10

    filtered_est = Dataset()
    for sp in filtered_names:
        count = est.lookup_count(sp)
        # filter out results with less than min abundances
        if count > min_abundances:
            filtered_est.add_record(sp, count, count, est.lookup_size(sp))

    if not transcripts and not functional:
        present_species = [
            tax_dict.get_name_by_id(s) for s in filtered_est.species
        ]
    else:
        present_species = filtered_est.species
    present_est = list(filtered_est.abundance)
    present_sp = [name.replace('_', ' ') for name in present_species]

    # sort based on est abundances (largest first)
    ranked = sorted(zip(present_sp, present_est),
                    key=lambda pair: pair[1],
                    reverse=True)
    if not ranked:
        # zip(*[]) cannot be unpacked into two names; bail out cleanly.
        print("No estimated results passed the filter; nothing to graph")
        return
    fil_sp, fil_est = zip(*ranked)

    print("Results that passed filter:")
    # calculate % for each result; the total is summed once, not per item
    total = math.fsum(fil_est)
    fil_per = [round(100 * n / total, 1) for n in fil_est]
    all_display = list(zip(fil_sp, fil_est, fil_per))

    # Dump the surviving name,value pairs, one per line.
    with open('species_hits.txt', 'w') as cutoff_file:
        for ab in all_display:
            cutoff_file.write("{},{}\n".format(ab[0], ab[1]))

    # NOTE(review): ``title`` is built but never handed to plot_setup_pre,
    # so the figure is drawn without it -- confirm whether that omission is
    # intentional before wiring it in or deleting it.
    if transcripts:
        title = "Human gut metatranscriptome estimated counts"
    elif functional:
        title = "Human gut metatranscriptome functional KEGG categories"
    else:
        title = "Human gut metagenome {}-level estimated counts".format(tier)

    # graph est abundances
    xmax = len(present_species)
    x = numpy.array(range(0, xmax))
    plotfunctions.plot_setup_pre(
        xlabels=fil_sp,
        xticks=list(range(0, xmax)),
        xrotation=90,
        yaxislabel='Counts')
    plotfunctions.plot(x, fil_est, color='red', plot_type='scatter')
    matplotlib.pyplot.gca().set_ylim(bottom=0.)  # y axis starts at 0

    if save_graphs:
        plotfunctions.plot_setup_post(
            save_file=expname + '_' + tier + '_estabundances.png',
            show=show_graphs)
    else:
        plotfunctions.plot_setup_post(legend=False)
    return