# Script configuration. The species to analyse is taken from the first
# command-line argument.
species_name = sys.argv[1]
debug = True
# Coverage threshold; presumably consumed further down the script (not
# referenced in this section).
min_coverage = 20

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

###
# Load genomic coverage distributions
sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(
    species_name)

# One nonzero-median coverage value per histogram, in the same order as the
# `samples` list returned alongside the histograms.
median_coverages = numpy.array([
    stats_utils.calculate_nonzero_median_from_histogram(sample_coverage_histogram)
    for sample_coverage_histogram in sample_coverage_histograms
])

# Sample name -> median coverage. Pairing the two sequences directly avoids
# the index loop (and the Python-2-only xrange).
sample_coverage_map = dict(zip(samples, median_coverages))

# Load pi information for species_name
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
# NOTE: `samples` is rebound here. `median_coverages` above still follows the
# order of the coverage loader's sample list, so look coverages up through
# `sample_coverage_map` when indexing by this new `samples`.
# `debug` is passed by keyword so it cannot be bound to some other positional
# parameter of the loader.
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(
    species_name, debug=debug)
sys.stderr.write("Done!\n")

# Diversity as the ratio of total pi to the number of opportunities
# (presumably element-wise over per-sample arrays -- confirm against the
# loader's return types).
pis = total_pis / total_pi_opportunities
# Same ratio with one added to numerator and denominator, so the value stays
# defined where the opportunity count is zero.
clipped_pis = (total_pis + 1) / (total_pi_opportunities + 1)
# Load subject and sample metadata sys.stderr.write("Loading HMP metadata...\n") subject_sample_map = parse_midas_data.parse_subject_sample_map() sys.stderr.write("Done!\n") species_idx = 0 for species_name in species_names: species_name_items = species_name.split("_") species_label = "_".join([species_name_items[0]]+species_name_items[1:]) # Load genomic coverage distributions sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(species_name) median_coverages = numpy.array([stats_utils.calculate_nonzero_median_from_histogram(sample_coverage_histogram) for sample_coverage_histogram in sample_coverage_histograms]) sample_coverage_map = {samples[i]: median_coverages[i] for i in xrange(0,len(samples))} # Load pi information for species_name sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name) samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(species_name, debug=debug) sys.stderr.write("Done!\n") pis = total_pis/total_pi_opportunities median_coverages = numpy.array([sample_coverage_map[samples[i]] for i in xrange(0,len(samples))]) # Only plot samples above a certain depth threshold that are "haploids" desired_samples = samples[(median_coverages>=min_coverage)*(pis<=1e-03)] desired_median_coverages = median_coverages[(median_coverages>=min_coverage)*(pis<=1e-03)] if len(desired_samples) < 2: