} # Load SNP information for species_name sys.stderr.write("Loading %s...\n" % species_name) samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps( species_name, combination_type="sample", debug=False) sys.stderr.write("Done!\n") median_coverages = numpy.array( [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))]) # Calculate full matrix of synonymous pairwise differences sys.stderr.write("Calculate synonymous pi matrix...\n") pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix( allele_counts_map, passed_sites_map, variant_type='4D', allowed_genes=metaphlan2_genes) # Calculate fixation matrix fixation_matrix_syn = diversity_utils.calculate_fixation_matrix( allele_counts_map, passed_sites_map, variant_type='4D', min_change=min_change, allowed_genes=metaphlan2_genes) sys.stderr.write("Done!\n") # Calculate full matrix of nonsynonymous pairwise differences sys.stderr.write("Calculate nonsynonymous pi matrix...\n")
from parsers import parse_midas_data
from numpy.random import choice

# The species to analyse is the first command-line argument.
species_name = sys.argv[1]

# Subject <-> sample metadata.
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Per-sample SNP tables for the requested species.
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name,
    combination_type="sample",
    debug=False)
sys.stderr.write("Done!\n")

# Synonymous (4D) diversity; the diagonal of the averaged matrix is used
# below as each sample's own diversity value.
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D')

# Mask of samples whose diagonal entry is below 1e-03.
low_diversity_samples = (numpy.diag(avg_pi_matrix_syn) < 1e-03)

# Mask returned by the project helper for "unique" samples
# (presumably one sample per subject -- confirm in parse_midas_data).
unique_samples = parse_midas_data.calculate_unique_samples(
    subject_sample_map, samples)

# Element-wise product of the two masks: keep samples that satisfy both.
desired_samples = unique_samples * low_diversity_samples

# Distance bins for the LD computations: 20 logarithmically spaced edges
# from 10^0 = 1 to 10^4 (exponents 0..4), i.e. 19 intervals.
distance_bins = numpy.logspace(0, 4, 20)

# Independent copy of every edge except the last one, used as the location
# of each bin (original note: shifted by one to avoid edge effects when
# plotting).
distance_bin_locations = distance_bins[:-1].copy()
# Read the per-sample SNP tables for the requested species.
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, combination_type="sample", debug=False)
sys.stderr.write("Done!\n")

# Per-gene results, appended in gene iteration order.
locations = []
pi_syns = []
pi_nons = []

for gene_name in passed_sites_map.keys():

    # --- synonymous (4D) sites ------------------------------------------
    # `diag_indices_from` requires a square array, so this is a pairwise
    # matrix (presumably samples x samples -- confirm in parse_snps).
    passed_sites = passed_sites_map[gene_name]['4D']['sites']
    passed_pairs = (passed_sites > 0.5)
    # Drop the diagonal so only comparisons between distinct entries count.
    passed_pairs[numpy.diag_indices_from(passed_pairs)] = False

    # Skip genes with no usable pair *before* the pi-matrix calculation, so
    # that work is not done for genes whose result would be discarded.
    total_pairs = passed_pairs.sum()
    if total_pairs < 0.5:
        continue

    pi_matrix, avg_pi_matrix = diversity_utils.calculate_pi_matrix(
        allele_counts_map, passed_sites_map, variant_type='4D',
        allowed_genes=set([gene_name]))

    # Mean of pi_matrix over the passing off-diagonal pairs.  total_pairs is
    # guaranteed non-zero here by the guard above, so no zero-division
    # fudge term is needed in the denominator.
    avg_pi = (pi_matrix * passed_pairs).sum() / total_pairs

    pi_syns.append(avg_pi)
    locations.append(passed_sites_map[gene_name]['4D']['location'])

    # --- nonsynonymous (1D) sites ---------------------------------------
    passed_sites = passed_sites_map[gene_name]['1D']['sites']
    passed_pairs = (passed_sites > 0.5)