Esempio n. 1
0
# Script configuration: the species to analyse is the first command-line
# argument; `debug` is forwarded to the pi parser below.
species_name = sys.argv[1]
debug = True
min_coverage = 20

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

###

# Load genomic coverage distributions
sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(
    species_name)
# One nonzero-median value per histogram, in the order the parser returned them.
median_coverages = numpy.array([
    stats_utils.calculate_nonzero_median_from_histogram(
        sample_coverage_histogram)
    for sample_coverage_histogram in sample_coverage_histograms
])
# Map each sample to its median coverage. zip() pairs the two sequences
# positionally and, unlike an xrange index loop, runs on Python 2 and 3.
# NOTE(review): assumes the parser returns one histogram per sample -- confirm.
sample_coverage_map = dict(zip(samples, median_coverages))

# Load pi information for species_name
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
# NOTE: `samples` is rebound here to the sample list returned by the pi parser;
# use sample_coverage_map to look up coverage for these samples.
# `debug` is passed by keyword so it cannot bind to a different positional
# parameter of the parser.
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(
    species_name, debug=debug)
sys.stderr.write("Done!\n")
# Ratio of total pi to the number of opportunities.
# NOTE(review): presumably numpy float arrays (element-wise true division);
# confirm the dtype, since integer inputs would floor-divide on Python 2.
pis = total_pis / total_pi_opportunities
# Same ratio with 1 added to numerator and denominator.
clipped_pis = (total_pis + 1) / (total_pi_opportunities + 1)
Esempio n. 2
0
# Counter initialised to zero ahead of the per-species loop
# (presumably a running species index -- its use is not visible here).
species_idx = 0

# Read the subject/sample metadata, announcing start and completion on stderr.
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

for species_name in species_names:
    
    species_name_items = species_name.split("_")
    
    species_label = "_".join([species_name_items[0]]+species_name_items[1:])
    
    # Load genomic coverage distributions
    sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(species_name)
    median_coverages = numpy.array([stats_utils.calculate_nonzero_median_from_histogram(sample_coverage_histogram) for sample_coverage_histogram in sample_coverage_histograms])
    sample_coverage_map = {samples[i]: median_coverages[i] for i in xrange(0,len(samples))}

    # Load pi information for species_name
    sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
    samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(species_name, debug=debug)
    sys.stderr.write("Done!\n")
    pis = total_pis/total_pi_opportunities

    median_coverages = numpy.array([sample_coverage_map[samples[i]] for i in xrange(0,len(samples))])
    
    # Only plot samples above a certain depth threshold that are "haploids"
    desired_samples = samples[(median_coverages>=min_coverage)*(pis<=1e-03)]
    desired_median_coverages = median_coverages[(median_coverages>=min_coverage)*(pis<=1e-03)]

    if len(desired_samples) < 2: