    divergence_matrices[species_name] = snp_substitution_matrix

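    # Collect the upper-triangular (i < j) entries of the pairwise divergence
    # matrix, skipping negative (missing) values, as between-sample divergences.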
    between_divergences[species_name] = []
    for i in xrange(0, divergence_matrices[species_name].shape[0]):
        for j in xrange(i + 1, divergence_matrices[species_name].shape[0]):

            if divergence_matrices[species_name][i, j] >= 0:

                between_divergences[species_name].append(
                    divergence_matrices[species_name][i, j])
    between_divergences[species_name] = numpy.array(
        between_divergences[species_name])

    # Load SNP information for species_name
    sys.stderr.write("Loading SFSs for %s...\t" % species_name)
    sfs_samples, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name)
    sys.stderr.write("Done!\n")

    highcoverage_samples = diversity_utils.calculate_highcoverage_samples(
        species_name)
    desired_samples = snp_samples

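    # Within-sample polymorphism rate for each sample: the fraction of sites
    # in that sample's SFS that are polymorphic within the host.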
    within_polymorphisms[species_name] = []
    for sample in desired_samples:
        within_sites, between_sites, total_sites = sfs_utils.calculate_polymorphism_rates_from_sfs_map(
            sfs_map[sample])
        within_polymorphisms[species_name].append(within_sites * 1.0 /
                                                  total_sites)

species_names = []
sample_sizes = []
Example 2
subject_sample_map = parse_HMP_data.parse_subject_sample_map()
sample_order_map = parse_HMP_data.parse_sample_order_map()
sys.stderr.write("Done!\n")

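# Analyze every "good" species unless a specific species was requested.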
if other_species_str == "":
    good_species_list = parse_midas_data.parse_good_species_list()
else:
    good_species_list = [species_name]

# store all the species' data in a dictionary:
all_data = {}
#key=species
#value={}, key=gene, value=num times gene shows up
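# For example (hypothetical names), a populated entry might look like:
#   all_data['Bacteroides_vulgatus_57955'] = {'gene_00123': 2, 'gene_00456': 1}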

for species_name in good_species_list:
    dummy_samples, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types=set(['1D', '2D', '3D', '4D']))
    #
    # data structures for storing information for pickling later on
    all_species_gene_changes = {}
    #all_species_gene_changes_category={}
    all_species_null = {}
    all_data[species_name] = {}
    #
    ####################
    # Analyze the data #
    ####################
    #
    # Only plot samples above a certain depth threshold that are "haploids"
    haploid_samples = diversity_utils.calculate_haploid_samples(species_name,
                                                                debug=debug)
    #
# Load per-sample coverage distributions for species_name; the loader call and
# its return order are assumed here, inferred from how sample_coverage_histograms
# and samples are used below.
sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(
    species_name)
median_coverages = numpy.array([
    stats_utils.calculate_nonzero_median_from_histogram(
        sample_coverage_histogram)
    for sample_coverage_histogram in sample_coverage_histograms
])
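# Map each sample to its median non-zero coverage.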
sample_coverage_map = {
    samples[i]: median_coverages[i]
    for i in xrange(0, len(samples))
}

# Load pi information for species_name
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
samples, site_map = parse_midas_data.parse_within_sample_sfs(
    species_name,
    allowed_variant_types=set(['4D']),
    allowed_genes=core_genes,
    debug=debug)
sys.stderr.write("Done!\n")

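# Median coverage for each sample returned by the SFS loader, in the same order.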
median_coverages = numpy.array(
    [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))])

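# Debugging check: print the number of SFS entries per sample, then exit early.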
print [len(site_map[samples[i]].keys()) for i in xrange(0, len(samples))]

sys.exit(0)

# Only plot samples above a certain depth threshold that are "haploids"
snp_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)]
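# i.e. keep samples with median coverage >= min_coverage and within-sample
# diversity (pis) <= 1e-03.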

num_haploids = len(snp_samples)