Ejemplo n.º 1
0
                       clipped_pis[same_subject_idxs[1]])
# Element-wise larger of the two clipped pi values picked out by
# same_subject_idxs[0] and same_subject_idxs[1]. numpy.fmax returns the
# non-NaN operand when exactly one of the two values is NaN.
upper_pis = numpy.fmax(clipped_pis[same_subject_idxs[0]],
                       clipped_pis[same_subject_idxs[1]])

# Only plot samples above a certain depth threshold that are "haploids".
# The coverage test (>= min_coverage) and the pi test (<= 1e-03) are
# multiplied together to form one mask that indexes `samples`.
# NOTE(review): presumably both operands are numpy boolean arrays, so `*`
# acts as an element-wise logical AND -- confirm against where they are built.
desired_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)]

###

# Load SNP information for species_name, passing desired_samples as the
# allowed_samples argument. Progress messages go to stderr. This call site
# unpacks four return values from parse_snps (the last is bound to
# final_line_number).
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
    species_name, debug=debug, allowed_samples=desired_samples)
sys.stderr.write("Done!\n")

# Pi matrices restricted to variant_type='4D'.
# NOTE(review): presumably '4D' means fourfold-degenerate (synonymous)
# sites, matching the `_syn` suffix -- confirm in diversity_utils.
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map, passed_sites_map, variant_type='4D')

# One flag per diagonal entry of avg_pi_matrix_syn: True where the value
# is below 1e-03.
low_diversity_samples = (numpy.diag(avg_pi_matrix_syn) < 1e-03)

unique_samples = parse_midas_data.calculate_unique_samples(
    subject_sample_map, samples)

# Element-wise product of the two masks.
# NOTE(review): presumably both are boolean arrays, making this a logical
# AND (unique AND low-diversity) -- confirm the return type of
# calculate_unique_samples.
desired_samples = unique_samples * low_diversity_samples

# Initialize distance bins for LD computations.
# numpy.logspace(0, 4, 20) returns 20 values evenly spaced on a log10 scale,
# from 10^0 = 1 up to 10^4 (log10(1) = 0, log10(10^4) = 4).
# NOTE(review): if these values are used as bin edges, they delimit 19 bins
# rather than 20 -- confirm how distance_bins is consumed.
distance_bins = numpy.logspace(
    0, 4, 20
)
# Independent copy of every value except the last one, used as the bin
# locations (original note: shifted one to avoid edge effects for plotting).
distance_bin_locations = numpy.array(
    distance_bins[:-1],
    copy=True)
}

# Load SNP information for species_name with combination_type="sample" and
# debugging disabled. Progress messages go to stderr. This call site unpacks
# three return values from parse_snps.
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, combination_type="sample", debug=False)
sys.stderr.write("Done!\n")

# Coverage value for every entry of `samples`, in the same order, looked up
# in sample_coverage_map. Iterating over `samples` directly replaces the
# Python-2-only `xrange` index loop, so the statement runs unchanged on
# Python 2 and Python 3.
median_coverages = numpy.array(
    [sample_coverage_map[sample] for sample in samples])

# Calculate full matrix of synonymous pairwise differences.
# Both helper calls below use variant_type='4D' and are limited to the genes
# in metaphlan2_genes via allowed_genes.
sys.stderr.write("Calculate synonymous pi matrix...\n")
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D',
    allowed_genes=metaphlan2_genes)

# Calculate fixation matrix over the same sites and genes; min_change is
# forwarded unchanged to the helper.
# NOTE(review): presumably min_change is the threshold for counting a
# difference as a fixation -- confirm in diversity_utils.
fixation_matrix_syn = diversity_utils.calculate_fixation_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D',
    min_change=min_change,
    allowed_genes=metaphlan2_genes)

sys.stderr.write("Done!\n")

# Calculate full matrix of nonsynonymous pairwise differences
# NOTE(review): the status message below announces the nonsynonymous pi
# computation, but the statements that follow only reload the SNP data for
# species_name (combination_type="sample", debug=False); no pi matrix is
# computed in this span. Verify that the message matches the intended step.
sys.stderr.write("Calculate nonsynonymous pi matrix...\n")
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, combination_type="sample", debug=False)
sys.stderr.write("Done!\n")

locations = []
pi_syns = []
pi_nons = []
for gene_name in passed_sites_map.keys():

    passed_sites = passed_sites_map[gene_name]['4D']['sites']
    passed_pairs = (passed_sites > 0.5)
    passed_pairs[numpy.diag_indices_from(passed_pairs)] = False
    pi_matrix, avg_pi_matrix = diversity_utils.calculate_pi_matrix(
        allele_counts_map,
        passed_sites_map,
        variant_type='4D',
        allowed_genes=set([gene_name]))
    total_pairs = passed_pairs.sum()

    if total_pairs < 0.5:
        continue

    avg_pi = (pi_matrix * passed_pairs).sum() / (total_pairs +
                                                 (total_pairs < 0.5))

    pi_syns.append(avg_pi)
    locations.append(passed_sites_map[gene_name]['4D']['location'])

    passed_sites = passed_sites_map[gene_name]['1D']['sites']
    passed_pairs = (passed_sites > 0.5)