def graph_est(est, expname, tier, show_graphs, save_graphs, errors):
    """Plot the estimated counts that pass an abundance cutoff, highest first.

    The species in ``est`` are restricted to those returned by
    ``filter_by_taxa(est.species, tier)`` and then to those whose count is
    strictly greater than the cutoff (1000 when ``est`` holds more than 100
    species, 10 otherwise).  The survivors are printed together with their
    percentage of the surviving total and drawn as a red scatter plot whose
    y axis starts at 0.

    Args:
        est: Dataset-like object providing ``species``, ``lookup_count``
            and ``lookup_size``.
        expname: Prefix of the saved image file name.
        tier: Taxonomic level; used for filtering, the plot title and the
            saved file name.
        show_graphs: Passed as ``show`` to ``plot_setup_post`` when
            ``save_graphs`` is true.
        save_graphs: When true the figure is saved as
            ``<expname>_<tier>_estabundances.png``; otherwise
            ``plot_setup_post(legend=False)`` is called.
        errors: Unused; kept so existing callers keep working.

    Returns:
        None.  If nothing passes the cutoff a message is printed and no
        plot is produced.
    """
    # These imports are here so script can run on server w/out graphics
    import plotfunctions
    import matplotlib
    import seaborn

    # Drop any entry that is above the targeted taxa level
    filtered_names = list(filter_by_taxa(est.species, tier))

    if len(est.species) > 100:
        min_abundances = 1000
        print("Filtering out any estimated results under {} counts".format(min_abundances))
    else:
        min_abundances = 10

    # filter out results with less than min abundances
    filtered_est = Dataset()
    for sp in filtered_names:
        count = est.lookup_count(sp)
        if count > min_abundances:
            filtered_est.add_record(sp, count, count, est.lookup_size(sp))

    present_species = [tax_dict.get_name_by_id(s) for s in filtered_est.species]
    xmax = len(present_species)
    if xmax == 0:
        # Unpacking zip(*[]) below would raise ValueError; bail out cleanly.
        print("No estimated results above {} counts; nothing to plot".format(min_abundances))
        return

    present_sp = [name.replace('_', ' ') for name in present_species]
    present_est = list(filtered_est.abundance)
    x = numpy.array(range(xmax))

    # sort based on est abundances, highest first
    all_filter = sorted(zip(present_sp, present_est),
                        key=lambda rec: rec[1], reverse=True)
    fil_sp, fil_est = zip(*all_filter)

    print("Results that passed filter:")
    # calculate % for each result (total computed once, not per element)
    total = math.fsum(fil_est)
    fil_per = [round(100 * n / total, 1) for n in fil_est]
    pprint.pprint(list(zip(fil_sp, fil_est, fil_per)))

    plotfunctions.plot_setup_pre(
        "{}-level estimated counts".format(tier),
        xlabels=fil_sp, xticks=list(range(xmax)),
        xrotation=-90, yaxislabel='Counts')

    plotfunctions.plot(x, fil_est, color='red', plot_type='scatter')
    matplotlib.pyplot.gca().set_ylim(bottom=0.)
    if save_graphs:
        plotfunctions.plot_setup_post(
            save_file=expname + '_' + tier + '_estabundances.png',
            show=show_graphs)
    else:
        plotfunctions.plot_setup_post(legend=False)

    return
Ejemplo n.º 2
0
def graph_est(est, expname, tier, show_graphs, save_graphs, errors):
    """Plot the estimated counts that pass an abundance cutoff, highest first.

    The species in ``est`` are restricted to those returned by
    ``filter_by_taxa(est.species, tier)`` and then to those whose count is
    strictly greater than the cutoff (1000 when ``est`` holds more than 100
    species, 10 otherwise).  The survivors are printed together with their
    percentage of the surviving total and drawn as a red scatter plot whose
    y axis starts at 0.

    Args:
        est: Dataset-like object providing ``species``, ``lookup_count``
            and ``lookup_size``.
        expname: Prefix of the saved image file name.
        tier: Taxonomic level; used for filtering, the plot title and the
            saved file name.
        show_graphs: Passed as ``show`` to ``plot_setup_post`` when
            ``save_graphs`` is true.
        save_graphs: When true the figure is saved as
            ``<expname>_<tier>_estabundances.png``; otherwise
            ``plot_setup_post(legend=False)`` is called.
        errors: Unused; kept so existing callers keep working.

    Returns:
        None.  If nothing passes the cutoff a message is printed and no
        plot is produced.
    """
    # These imports are here so script can run on server w/out graphics
    import plotfunctions
    import matplotlib
    import seaborn

    # Drop any entry that is above the targeted taxa level
    filtered_names = list(filter_by_taxa(est.species, tier))

    if len(est.species) > 100:
        min_abundances = 1000
        print("Filtering out any estimated results under {} counts".format(
            min_abundances))
    else:
        min_abundances = 10

    # filter out results with less than min abundances
    filtered_est = Dataset()
    for sp in filtered_names:
        count = est.lookup_count(sp)
        if count > min_abundances:
            filtered_est.add_record(sp, count, count, est.lookup_size(sp))

    present_species = [
        tax_dict.get_name_by_id(s) for s in filtered_est.species
    ]
    xmax = len(present_species)
    if xmax == 0:
        # Unpacking zip(*[]) below would raise ValueError; bail out cleanly.
        print("No estimated results above {} counts; nothing to plot".format(
            min_abundances))
        return

    present_sp = [name.replace('_', ' ') for name in present_species]
    present_est = list(filtered_est.abundance)
    x = numpy.array(range(xmax))

    # sort based on est abundances, highest first
    all_filter = sorted(zip(present_sp, present_est),
                        key=lambda rec: rec[1],
                        reverse=True)
    fil_sp, fil_est = zip(*all_filter)

    print("Results that passed filter:")
    # calculate % for each result (total computed once, not per element)
    total = math.fsum(fil_est)
    fil_per = [round(100 * n / total, 1) for n in fil_est]
    pprint.pprint(list(zip(fil_sp, fil_est, fil_per)))

    plotfunctions.plot_setup_pre("{}-level estimated counts".format(tier),
                                 xlabels=fil_sp,
                                 xticks=list(range(xmax)),
                                 xrotation=-90,
                                 yaxislabel='Counts')

    plotfunctions.plot(x, fil_est, color='red', plot_type='scatter')
    matplotlib.pyplot.gca().set_ylim(bottom=0.)
    if save_graphs:
        plotfunctions.plot_setup_post(save_file=expname + '_' + tier +
                                      '_estabundances.png',
                                      show=show_graphs)
    else:
        plotfunctions.plot_setup_post(legend=False)

    return
Ejemplo n.º 3
0
def graph_error(truth,
                est,
                adjusted_abundance,
                diff,
                expname,
                tier,
                norm_factor,
                show_graphs,
                save_graphs,
                errors,
                program='kallisto'):
    """Plot true counts against estimated counts for one taxonomic level.

    The estimate is restricted to ``filter_by_taxa(est.species, tier)``
    plus every species in ``truth``, its counts are scaled by
    ``norm_factor``, and species ids are replaced by names from
    ``tax_dict``.  Two layouts exist:

    * When the filtered estimate and the truth hold the same number of
      species, the truth counts are plotted against ``adjusted_abundance``
      (scatter) and saved as ``<expname>_<tier>_counts.png``.
    * Otherwise every species that is in the truth, or whose scaled
      estimate exceeds a cutoff, is plotted with error bars and saved as
      ``<expname>_<tier>_sigcounts.png``.

    Side effect: ``truth.species`` is overwritten in place with display
    names and ``truth.remake_index()`` is called, so the caller's object is
    modified.

    Args:
        truth: Dataset-like object with ``species``, ``counts``,
            ``lookup_count`` and ``remake_index``.
        est: Dataset-like object with ``species`` and ``lookup_count``.
        adjusted_abundance: Sequence zipped with ``truth.species`` and
            ``truth.counts``; only used in the equal-length layout.
        diff: Unused; kept so existing callers keep working.
        expname: Prefix of the saved image file name.
        tier: Taxonomic level; used for filtering, the title and file name.
        norm_factor: Multiplier applied to the filtered estimate counts.
        show_graphs: Passed as ``show`` to ``plot_setup_post`` when saving.
        save_graphs: When true the figure is saved; otherwise
            ``plot_setup_post(legend=False)`` is called.
        errors: Mapping indexed by species id; its values are stored in the
            abundance field of the filtered estimate and used as ``yerr``.
        program: Name shown in the title ('clark' and 'bracken' are
            re-capitalised).

    Returns:
        None.  If there is nothing to plot a message is printed and no
        figure is produced.
    """
    # These imports are here so script can run on server w/out graphics
    import plotfunctions
    import matplotlib
    import seaborn

    # format program name
    program = {'clark': 'CLARK', 'bracken': 'Bracken'}.get(program, program)

    # Drop any entry that is above the targeted taxa level
    # sometimes the strain-level filter filters out species-level genomes, so add truth back in
    filtered_names = list(
        filter_by_taxa(est.species, tier).union(set(truth.species)))

    # abundance field contains stdev errors if they exist
    filtered_est = Dataset()
    filtered_est.set_by_array([(sp, errors[sp], est.lookup_count(sp))
                               for sp in filtered_names])

    # Switch both datasets from ids to names (this mutates the caller's truth).
    truth.species = [tax_dict.get_name_by_id(s) for s in truth.species]
    truth.remake_index()
    filtered_est.species = [
        tax_dict.get_name_by_id(s) for s in filtered_est.species
    ]
    filtered_est.counts = list(numpy.array(filtered_est.counts) * norm_factor)
    filtered_est.remake_index()

    title = "{} estimated counts at {}-level".format(program, tier)

    if len(filtered_est.species) == len(truth.species):
        # graph true abundances, highest true count first
        xmax = len(truth.species)
        ab_filter = sorted(zip(truth.species, truth.counts,
                               adjusted_abundance),
                           key=lambda rec: rec[1],
                           reverse=True)
        if not ab_filter:
            # Unpacking zip(*[]) would raise ValueError; bail out cleanly.
            print("No species to plot")
            return
        ab_species, ab_true, ab_adjusted = zip(*ab_filter)
        x = numpy.array(range(xmax))

        plotfunctions.plot_setup_pre(title,
                                     xlabels=ab_species,
                                     xticks=list(range(xmax)),
                                     xrotation=-90,
                                     yaxislabel='Counts')

        plotfunctions.plot(x, ab_true, color='blue', label='True')
        plotfunctions.plot(x,
                           ab_adjusted,
                           color='red',
                           label='Estimated',
                           plot_type='scatter')
        suffix = '_counts.png'

    else:
        # graph abundances for all species, not just the true ones, if above cutoff
        if len(est.species) - filtered_est.counts.count(0) > len(
                truth.species) * 1.25:
            mean_ab = min(
                truth.counts) / 10  # 10% of lowest actual count in truth
            print("Filtering out any estimated results under {} counts".format(
                mean_ab))
        else:
            mean_ab = 1

        # Build the set once so membership is not a list scan per species.
        true_names = set(truth.species)

        # rows are (presentation name, true count, est count, error)
        all_sort = []
        for sp in filtered_est.species:
            est_count = filtered_est.lookup_count(sp)
            # keep every true species, plus any estimate above the cutoff
            if sp in true_names or est_count > mean_ab:
                all_sort.append((sp.replace('_', ' '),
                                 truth.lookup_count(sp),
                                 est_count,
                                 filtered_est.lookup_abundance(sp)))

        if not all_sort:
            # Unpacking zip(*[]) would raise ValueError; bail out cleanly.
            print("No species to plot")
            return

        xmax = len(all_sort)
        x = numpy.array(range(xmax))

        # sort based on true counts, then sort false positives by est counts
        # (two stable passes: secondary key first, primary key last)
        all_sort.sort(key=lambda rec: rec[2], reverse=True)
        all_sort.sort(key=lambda rec: rec[1], reverse=True)

        present_species, present_true, present_est, present_errors = zip(
            *all_sort)

        plotfunctions.plot_setup_pre(title,
                                     xticks=list(range(xmax)),
                                     xrotation=-90,
                                     yaxislabel='Counts',
                                     xlabels=present_species)

        plotfunctions.plot(x, present_true, color='blue', label="True")
        plotfunctions.plot(x,
                           present_est,
                           plot_type='error',
                           color='red',
                           label="Estimated",
                           fmt='o',
                           yerr=present_errors)
        suffix = '_sigcounts.png'

    matplotlib.pyplot.gca().set_ylim(bottom=0.)  # set x axis at y=0
    if save_graphs:
        plotfunctions.plot_setup_post(save_file=expname + '_' + tier + suffix,
                                      show=show_graphs)
    else:
        plotfunctions.plot_setup_post(legend=False)

    return
def graph_error(truth, est, adjusted_abundance, diff, expname, tier,
                norm_factor, show_graphs, save_graphs, errors,
                program='kallisto'):
    """Plot true counts against estimated counts for one taxonomic level.

    The estimate is restricted to ``filter_by_taxa(est.species, tier)``
    plus every species in ``truth``, its counts are scaled by
    ``norm_factor``, and species ids are replaced by names from
    ``tax_dict``.  Two layouts exist:

    * When the filtered estimate and the truth hold the same number of
      species, the truth counts are plotted against ``adjusted_abundance``
      (scatter) and saved as ``<expname>_<tier>_counts.png``.
    * Otherwise every species that is in the truth, or whose scaled
      estimate exceeds a cutoff, is plotted with error bars and saved as
      ``<expname>_<tier>_sigcounts.png``.

    Side effect: ``truth.species`` is overwritten in place with display
    names and ``truth.remake_index()`` is called, so the caller's object is
    modified.

    Args:
        truth: Dataset-like object with ``species``, ``counts``,
            ``lookup_count`` and ``remake_index``.
        est: Dataset-like object with ``species`` and ``lookup_count``.
        adjusted_abundance: Sequence zipped with ``truth.species`` and
            ``truth.counts``; only used in the equal-length layout.
        diff: Unused; kept so existing callers keep working.
        expname: Prefix of the saved image file name.
        tier: Taxonomic level; used for filtering, the title and file name.
        norm_factor: Multiplier applied to the filtered estimate counts.
        show_graphs: Passed as ``show`` to ``plot_setup_post`` when saving.
        save_graphs: When true the figure is saved; otherwise
            ``plot_setup_post(legend=False)`` is called.
        errors: Mapping indexed by species id; its values are stored in the
            abundance field of the filtered estimate and used as ``yerr``.
        program: Name shown in the title ('clark' and 'bracken' are
            re-capitalised).

    Returns:
        None.  If there is nothing to plot a message is printed and no
        figure is produced.
    """
    # These imports are here so script can run on server w/out graphics
    import plotfunctions
    import matplotlib
    import seaborn

    # format program name
    program = {'clark': 'CLARK', 'bracken': 'Bracken'}.get(program, program)

    # Drop any entry that is above the targeted taxa level
    # sometimes the strain-level filter filters out species-level genomes, so add truth back in
    filtered_names = list(filter_by_taxa(est.species, tier).union(set(truth.species)))

    # abundance field contains stdev errors if they exist
    filtered_est = Dataset()
    filtered_est.set_by_array(
        [(sp, errors[sp], est.lookup_count(sp)) for sp in filtered_names])

    # Switch both datasets from ids to names (this mutates the caller's truth).
    truth.species = [tax_dict.get_name_by_id(s) for s in truth.species]
    truth.remake_index()
    filtered_est.species = [tax_dict.get_name_by_id(s) for s in filtered_est.species]
    filtered_est.counts = list(numpy.array(filtered_est.counts) * norm_factor)
    filtered_est.remake_index()

    title = "{} estimated counts at {}-level".format(program, tier)

    if len(filtered_est.species) == len(truth.species):
        # graph true abundances, highest true count first
        xmax = len(truth.species)
        ab_filter = sorted(
            zip(truth.species, truth.counts, adjusted_abundance),
            key=lambda rec: rec[1], reverse=True)
        if not ab_filter:
            # Unpacking zip(*[]) would raise ValueError; bail out cleanly.
            print("No species to plot")
            return
        ab_species, ab_true, ab_adjusted = zip(*ab_filter)
        x = numpy.array(range(xmax))

        plotfunctions.plot_setup_pre(
            title, xlabels=ab_species, xticks=list(range(xmax)),
            xrotation=-90, yaxislabel='Counts')

        plotfunctions.plot(x, ab_true, color='blue', label='True')
        plotfunctions.plot(x, ab_adjusted, color='red', label='Estimated',
                           plot_type='scatter')
        suffix = '_counts.png'

    else:
        # graph abundances for all species, not just the true ones, if above cutoff
        if len(est.species) - filtered_est.counts.count(0) > len(truth.species) * 1.25:
            mean_ab = min(truth.counts) / 10  # 10% of lowest actual count in truth
            print("Filtering out any estimated results under {} counts".format(mean_ab))
        else:
            mean_ab = 1

        # Build the set once so membership is not a list scan per species.
        true_names = set(truth.species)

        # rows are (presentation name, true count, est count, error)
        all_sort = []
        for sp in filtered_est.species:
            est_count = filtered_est.lookup_count(sp)
            # keep every true species, plus any estimate above the cutoff
            if sp in true_names or est_count > mean_ab:
                all_sort.append((sp.replace('_', ' '),
                                 truth.lookup_count(sp),
                                 est_count,
                                 filtered_est.lookup_abundance(sp)))

        if not all_sort:
            # Unpacking zip(*[]) would raise ValueError; bail out cleanly.
            print("No species to plot")
            return

        xmax = len(all_sort)
        x = numpy.array(range(xmax))

        # sort based on true counts, then sort false positives by est counts
        # (two stable passes: secondary key first, primary key last)
        all_sort.sort(key=lambda rec: rec[2], reverse=True)
        all_sort.sort(key=lambda rec: rec[1], reverse=True)

        present_species, present_true, present_est, present_errors = zip(*all_sort)

        plotfunctions.plot_setup_pre(
            title, xticks=list(range(xmax)),
            xrotation=-90, yaxislabel='Counts', xlabels=present_species)

        plotfunctions.plot(x, present_true, color='blue', label="True")
        plotfunctions.plot(x, present_est, plot_type='error', color='red',
                           label="Estimated", fmt='o', yerr=present_errors)
        suffix = '_sigcounts.png'

    matplotlib.pyplot.gca().set_ylim(bottom=0.)  # set x axis at y=0
    if save_graphs:
        plotfunctions.plot_setup_post(
            save_file=expname + '_' + tier + suffix, show=show_graphs)
    else:
        plotfunctions.plot_setup_post(legend=False)

    return
def graph_est(est, expname, tier, show_graphs, save_graphs, errors):
    """Plot and export the estimated counts that pass an abundance cutoff.

    The species in ``est`` are restricted to
    ``filter_by_taxa(est.species, tier)`` (or left unfiltered when that
    call raises), then to those whose count is strictly greater than the
    cutoff (1000 when ``est`` holds more than 100 species, 10 otherwise).
    The survivors, highest first, are written as ``name,count`` lines to
    ``species_hits.txt`` and drawn as a red scatter plot whose y axis
    starts at 0.

    The names ``transcripts`` and ``functional`` are read from the
    enclosing scope (they are not defined in this function); when either
    is truthy the species ids are used as labels without a ``tax_dict``
    lookup.

    Args:
        est: Dataset-like object providing ``species``, ``lookup_count``
            and ``lookup_size``.
        expname: Prefix of the saved image file name.
        tier: Taxonomic level; used for filtering and the saved file name.
        show_graphs: Passed as ``show`` to ``plot_setup_post`` when
            ``save_graphs`` is true.
        save_graphs: When true the figure is saved as
            ``<expname>_<tier>_estabundances.png``; otherwise
            ``plot_setup_post(legend=False)`` is called.
        errors: Unused; kept so existing callers keep working.

    Returns:
        None.  If nothing passes the cutoff a message is printed and
        neither the file nor the plot is produced.
    """
    # These imports are here so script can run on server w/out graphics
    import plotfunctions
    import matplotlib
    import seaborn

    # Drop any entry that is above the targeted taxa level
    try:
        filtered_names = list(filter_by_taxa(est.species, tier))
    except Exception:
        # for functional or transcript level analysis
        # (Exception, not a bare except, so Ctrl-C / SystemExit still propagate)
        filtered_names = est.species

    if len(est.species) > 100:
        min_abundances = 1000
        print("Filtering out any estimated results under {} counts".format(min_abundances))
    else:
        min_abundances = 10

    # filter out results with less than min abundances
    filtered_est = Dataset()
    for sp in filtered_names:
        count = est.lookup_count(sp)
        if count > min_abundances:
            filtered_est.add_record(sp, count, count, est.lookup_size(sp))

    if not transcripts and not functional:
        present_species = [tax_dict.get_name_by_id(s) for s in filtered_est.species]
    else:
        present_species = filtered_est.species

    xmax = len(present_species)
    if xmax == 0:
        # Unpacking zip(*[]) below would raise ValueError; bail out cleanly.
        print("No estimated results above {} counts; nothing to plot".format(min_abundances))
        return

    present_sp = [name.replace('_', ' ') for name in present_species]
    present_est = list(filtered_est.abundance)
    x = numpy.array(range(xmax))

    # sort based on est abundances, highest first
    all_filter = sorted(zip(present_sp, present_est),
                        key=lambda rec: rec[1], reverse=True)
    fil_sp, fil_est = zip(*all_filter)

    print("Results that passed filter:")

    # print present species
    with open('species_hits.txt', 'w') as cutoff_file:
        for name, count in all_filter:
            cutoff_file.write("{},{}\n".format(name, count))

    if transcripts:
        title = "Human gut metatranscriptome estimated counts"
    elif functional:
        title = "Human gut metatranscriptome functional KEGG categories"
    else:
        title = "Human gut metagenome {}-level estimated counts".format(tier)
    # NOTE(review): title is built above but is not passed to
    # plot_setup_pre below -- confirm whether the plot is meant to be untitled.
    plotfunctions.plot_setup_pre(
        xlabels=fil_sp, xticks=list(range(xmax)),
        xrotation=90, yaxislabel='Counts')

    plotfunctions.plot(x, fil_est, color='red', plot_type='scatter')
    matplotlib.pyplot.gca().set_ylim(bottom=0.)
    if save_graphs:
        plotfunctions.plot_setup_post(
            save_file=expname + '_' + tier + '_estabundances.png',
            show=show_graphs)
    else:
        plotfunctions.plot_setup_post(legend=False)

    return