}

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, combination_type="sample", debug=False)
sys.stderr.write("Done!\n")

# Coverage value for every sample, in the same order as `samples`.
# Iterating over `samples` directly (instead of indexing through
# xrange(0, len(samples))) is clearer and does not depend on the
# Python-2-only xrange builtin.
median_coverages = numpy.array(
    [sample_coverage_map[sample] for sample in samples])

# Calculate full matrix of synonymous pairwise differences
# (4D sites, restricted to the genes in metaphlan2_genes)
sys.stderr.write("Calculate synonymous pi matrix...\n")
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D',
    allowed_genes=metaphlan2_genes)

# Calculate fixation matrix over the same variant type and gene set;
# min_change is forwarded to calculate_fixation_matrix unchanged.
fixation_matrix_syn = diversity_utils.calculate_fixation_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D',
    min_change=min_change,
    allowed_genes=metaphlan2_genes)

sys.stderr.write("Done!\n")

# Calculate full matrix of nonsynonymous pairwise differences
sys.stderr.write("Calculate nonsynonymous pi matrix...\n")
from parsers import parse_midas_data
from numpy.random import choice
# Species to analyse is the first command-line argument
species_name = sys.argv[1]

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name,
    combination_type="sample",
    debug=False)
sys.stderr.write("Done!\n")

# Pairwise pi at 4D sites (no gene restriction here)
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D')

# Mask of samples whose diagonal (within-sample) average pi is below 1e-3
low_diversity_samples = numpy.diag(avg_pi_matrix_syn) < 1e-3

# Mask from calculate_unique_samples, aligned with `samples`
# (presumably one sample per subject -- confirm in parse_midas_data)
unique_samples = parse_midas_data.calculate_unique_samples(
    subject_sample_map, samples)

# Element-wise product of the two masks: keep samples passing both
desired_samples = unique_samples * low_diversity_samples

# initialize distance bins for LD computations:
# 20 log-spaced points running from 10**0 = 1 up to 10**4
distance_bins = numpy.logspace(0, 4, num=20)

# Independent copy of every bin edge except the last one
# (original note: shifted one to avoid edge effects for plotting)
distance_bin_locations = distance_bins[:-1].copy()
# Example #3
# 0
# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name,
    combination_type="sample",
    debug=False)
sys.stderr.write("Done!\n")

# Per-gene accumulators; the gene loop that follows appends to them
locations, pi_syns, pi_nons = [], [], []
# For each gene in passed_sites_map, average the 4D pi matrix over the
# off-diagonal entries whose passed-site value exceeds 0.5, and record the
# result together with the gene's stored location.
for gene_name in passed_sites_map.keys():

    # Boolean mask of entries with a passed-site value above 0.5
    # (presumably a samples-by-samples count matrix -- TODO confirm);
    # the diagonal is cleared so self-comparisons are excluded.
    passed_sites = passed_sites_map[gene_name]['4D']['sites']
    passed_pairs = (passed_sites > 0.5)
    passed_pairs[numpy.diag_indices_from(passed_pairs)] = False
    # NOTE(review): this call runs before the total_pairs guard below, so
    # genes that end up skipped still pay for the pi-matrix computation.
    pi_matrix, avg_pi_matrix = diversity_utils.calculate_pi_matrix(
        allele_counts_map,
        passed_sites_map,
        variant_type='4D',
        allowed_genes=set([gene_name]))
    total_pairs = passed_pairs.sum()

    # Nothing to average for this gene: skip it
    if total_pairs < 0.5:
        continue

    # Mean of pi_matrix over the masked entries. The (total_pairs < 0.5)
    # term would avoid a zero denominator, but it is always 0 here because
    # of the `continue` above.
    avg_pi = (pi_matrix * passed_pairs).sum() / (total_pairs +
                                                 (total_pairs < 0.5))

    pi_syns.append(avg_pi)
    locations.append(passed_sites_map[gene_name]['4D']['location'])

    # Rebuild the same kind of mask from the gene's 1D sites
    passed_sites = passed_sites_map[gene_name]['1D']['sites']
    passed_pairs = (passed_sites > 0.5)